From bd7fb9fe0264400720bd7c8612900a41198cb508 Mon Sep 17 00:00:00 2001
From: Suraj Raut <sraut@cadence.com>
Date: Thu, 28 May 2026 07:16:45 -0700
Subject: [PATCH 1/7] [Cadence: Vision] ResNet18 & ResNet50: Optimized,
 DMA-enabled, functional

- Add DMA-optimized operators: conv2d (1x1/3x3/7x7), maxpool, quantize/dequantize, relu, add, mean, softmax, linear
- Add new operators: embedding, full, im2row, quantized_fully_connected, quantized_layer_norm, quantized_matmul, requantize, view_copy
- Add vision/kernels library and quantized_ops.h header
- Add config generator for DMA buffer sizing
- Update functions_vision.yaml and CMakeLists.txt
- Add third-party XAI libraries (libxai, libxai_common, libxa_nnlib)
- FACTO submodule update
---
 .gitignore                                    |     6 +-
 CMakeLists.txt                                |    18 +-
 backends/cadence/CMakeLists.txt               |     4 +
 backends/cadence/aot/functions_vision.yaml    |    18 +-
 backends/cadence/utils/FACTO                  |     2 +-
 backends/cadence/{ => utils}/runtime/BUCK     |     0
 .../cadence/{ => utils}/runtime/__init__.py   |     0
 .../cadence/{ => utils}/runtime/et_pal.cpp    |     0
 .../cadence/{ => utils}/runtime/etdump.py     |     0
 .../cadence/{ => utils}/runtime/executor.py   |     0
 .../{ => utils}/runtime/executor_main.sh      |     0
 .../cadence/{ => utils}/runtime/runtime.py    |     0
 .../cadence/{ => utils}/runtime/targets.bzl   |     0
 backends/cadence/{ => utils}/runtime/utils.py |     0
 .../cadence/vision/config_generator/README.md |   107 +
 .../generate_combined_configs.py              |   901 +
 .../config_generator/generate_idma_buffers.py |  1478 +
 .../generate_layer_configs.py                 |  1158 +
 .../config_generator/layer_configs_16k.h      |  2403 ++
 .../config_generator/layer_configs_24k.h      |  2403 ++
 .../config_generator/layer_configs_32k.h      |  2403 ++
 .../config_generator/layer_configs_4k.h       |  2403 ++
 .../config_generator/layer_configs_61k.h      |  2403 ++
 .../config_generator/layer_configs_8k.h       |  2403 ++
 .../config_generator/layer_configs_cache.h    |  2403 ++
 .../cadence/vision/kernels/CMakeLists.txt     |     4 +-
 .../cadence/vision/operators/CMakeLists.txt   |    68 +-
 backends/cadence/vision/operators/TARGETS     |     5 +
 .../vision/operators/conv/conv_exec_1x1j1d1.c |  1023 +
 .../vision/operators/conv/conv_exec_1x1j2d1.c |  1132 +
 .../vision/operators/conv/conv_exec_3x3j1d1.c |  1030 +
 .../vision/operators/conv/conv_exec_3x3j2d1.c |  1028 +
 .../vision/operators/conv/conv_exec_7x7j2d1.c |  1088 +
 .../operators/conv/conv_kernel_dispatcher.c   |    50 +
 .../vision/operators/conv/kernel_executors.h  |   137 +
 .../cadence/vision/operators/layer_configs.h  |  2403 ++
 .../operators/maxpool/maxpool_exec_mxnj2.c    |   352 +
 .../operators/maxpool/maxpool_executors.h     |    61 +
 .../vision/operators/mean/mean_exec_dma.c     |   149 +
 .../vision/operators/mean/mean_executors.h    |    51 +
 backends/cadence/vision/operators/op_add.cpp  |   338 +-
 .../operators/op_dequantize_per_tensor.cpp    |   293 +-
 .../operators/op_max_pool2d_with_indices.cpp  |   165 +
 backends/cadence/vision/operators/op_mean.cpp |   183 +
 .../operators/op_quantize_per_tensor.cpp      |   396 +-
 .../operators/op_quantized_conv_out.cpp       |   398 +-
 .../operators/op_quantized_linear_out.cpp     |   266 +-
 .../operators/op_quantized_relu_out.cpp       |   370 +-
 .../cadence/vision/operators/op_softmax.cpp   |   261 +-
 .../cadence/vision/third-party/CMakeLists.txt |   101 +
 backends/cadence/vision/third-party/dummy.c   |    17 -
 .../cadence/vision/third-party/include/api.h  |    65 +-
 .../cadence/vision/third-party/include/dma.h  |    42 +
 .../vision/third-party/include/dtypes.h       |    43 +-
 .../vision/third-party/include/dump_tensor.h  |    70 +
 .../cadence/vision/third-party/include/lib.h  |    72 +
 .../third-party/include/memory_manager.h      |    69 +
 .../vision/third-party/include/utils.h        |   182 +
 .../third-party/include_private/common.h      |    34 +-
 .../third-party/include_private/idma_init.h   |    36 -
 .../third-party/library/api/dequantize.c      |    81 +
 .../third-party/library/api/maxpool2df.c      |   248 +
 .../vision/third-party/library/api/mean.c     |   110 +
 .../third-party/library/api/quanitze_relu.c   |   112 +
 .../third-party/library/api/quantizef.c       |    79 +
 .../vision/third-party/library/api/vaddf.c    |   124 +
 .../third-party/library/api/vdot_zeropt.c     |   123 +
 .../third-party/library/api/vsoftmaxf.c       |    58 +-
 .../cadence/vision/third-party/library/dma.c  |    62 +
 .../third-party/library/memory_manager.c      |    44 +
 .../third-party/library/tables/expf_tbl.c     |    23 +-
 .../third-party/library/tables/inff_tbl.c     |     2 +-
 .../third-party/library/tables/nanf_tbl.c     |     2 +-
 .../vision/third-party/library/utils.c        |    26 +
 .../third-party/libxai/cnn/src/cnn_conv.c     |  1668 +
 .../third-party/libxai/cnn/src/cnn_conv_MOD.c |   510 +
 .../third-party/libxai/cnn/src/cnn_conv_MOW.c |    25 +
 .../third-party/libxai/cnn/src/cnn_conv_MOW.h |   738 +
 .../third-party/libxai/cnn/src/cnn_conv_SO.c  |    27 +
 .../third-party/libxai/cnn/src/cnn_conv_SO.h  |   110 +
 .../third-party/libxai/cnn/src/cnn_conv_VQ.c  |  1371 +
 .../cnn/src/cnn_dataConversion3D_AsymQ_S8IX.h |   421 +
 .../cnn/src/cnn_dataConversion3D_I16I8.h      |   367 +
 .../cnn/src/cnn_dataConversion3D_I8I32.h      |   414 +
 .../cnn/src/cnn_dataConversion3D_S32IX.h      |   307 +
 .../libxai/cnn/src/cnn_datatransform.c        |  7835 +++++
 .../libxai/cnn/src/cnn_dilated_conv_MOD.c     |    24 +
 .../libxai/cnn/src/cnn_dilated_conv_MOD.h     | 16078 +++++++++
 .../libxai/cnn/src/cnn_dilated_conv_MOD_S16.c |    25 +
 .../libxai/cnn/src/cnn_dilated_conv_MOD_S16.h |   708 +
 .../libxai/cnn/src/cnn_dilated_conv_MOW.c     |    30 +
 .../libxai/cnn/src/cnn_dilated_conv_MOW.h     | 27240 ++++++++++++++++
 .../libxai/cnn/src/cnn_dilated_conv_MOW_S16.c |    23 +
 .../libxai/cnn/src/cnn_dilated_conv_MOW_S16.h |  2948 ++
 .../libxai/cnn/src/cnn_dilated_conv_SO.c      |    27 +
 .../libxai/cnn/src/cnn_dilated_conv_SO.h      |  1027 +
 .../libxai/cnn/src/cnn_dilated_conv_VQ_MOD.c  |    25 +
 .../cnn/src/cnn_dilated_conv_VQ_MOD_S16.c     |    25 +
 .../libxai/cnn/src/cnn_dilated_conv_VQ_MOW.c  |    30 +
 .../cnn/src/cnn_dilated_conv_VQ_MOW_S16.c     |    23 +
 .../libxai/cnn/src/cnn_dilated_conv_VQ_SO.c   |    30 +
 .../cnn/src/cnn_dilated_conv_VQ_partial_MOD.c |    22 +
 .../src/cnn_dilated_conv_VQ_partial_MOD_S16.c |    22 +
 .../cnn/src/cnn_dilated_conv_partial_MOD.c    |    22 +
 .../cnn/src/cnn_dilated_conv_partial_MOD.h    |  7858 +++++
 .../src/cnn_dilated_conv_partial_MOD_S16.c    |    22 +
 .../src/cnn_dilated_conv_partial_MOD_S16.h    |   878 +
 .../libxai/cnn/src/cnn_extend_edge.h          |  1517 +
 .../libxai/cnn/src/cnn_fill_tile.h            |   309 +
 .../third-party/libxai/cnn/src/cnn_helper.c   |  2141 ++
 .../third-party/libxai/include/xai_cnn.h      |   267 +
 .../third-party/libxai/include/xai_cnn_api.h  |  7041 ++++
 .../third-party/libxai/include/xai_intrin.h   |  1077 +
 .../include/xai_cnn_api_common.h              |   457 +
 .../include/xai_cnn_api_params.h              |  1886 ++
 .../libxai_common/include/xai_cnn_common.h    |  4329 +++
 .../libxai_common/include/xai_cnn_version.h   |    74 +
 .../libxai_common/include/xai_config_api.h    |   127 +
 .../libxai_common/include/xai_core.h          |   624 +
 .../libxai_common/include/xai_core_api.h      |   272 +
 .../libxai_common/include/xai_tile_manager.h  |  1246 +
 .../third-party/libxai_common/src/cnn_cast.c  |  1622 +
 .../third-party/libxai_common/src/cnn_cast.h  |  1890 ++
 .../libxai_common/src/cnn_cast_scalar.h       |   308 +
 .../libxai_common/src/cnn_eltwise_add.c       |   111 +
 .../libxai_common/src/cnn_eltwise_add.h       |   224 +
 .../libxai_common/src/cnn_eltwise_and.c       |    86 +
 .../libxai_common/src/cnn_eltwise_and.h       |   202 +
 .../libxai_common/src/cnn_eltwise_equal.c     |   112 +
 .../libxai_common/src/cnn_eltwise_equal.h     |   244 +
 .../src/cnn_eltwise_greaterthan.c             |   113 +
 .../src/cnn_eltwise_greaterthan.h             |   244 +
 .../libxai_common/src/cnn_eltwise_lessthan.c  |   103 +
 .../libxai_common/src/cnn_eltwise_lessthan.h  |   244 +
 .../libxai_common/src/cnn_eltwise_max.c       |   113 +
 .../libxai_common/src/cnn_eltwise_max.h       |   222 +
 .../libxai_common/src/cnn_eltwise_min.c       |   113 +
 .../libxai_common/src/cnn_eltwise_min.h       |   222 +
 .../libxai_common/src/cnn_eltwise_mul_S32.c   |   570 +
 .../libxai_common/src/cnn_eltwise_or.c        |    87 +
 .../libxai_common/src/cnn_eltwise_or.h        |   202 +
 .../libxai_common/src/cnn_eltwise_sub.c       |   112 +
 .../libxai_common/src/cnn_eltwise_sub.h       |   224 +
 .../libxai_common/src/cnn_eltwise_xor.c       |    87 +
 .../libxai_common/src/cnn_eltwise_xor.h       |   202 +
 .../libxai_common/src/xai_buildinfo.c         |    57 +
 .../libxai_common/src/xai_errstr.c            |    55 +
 147 files changed, 133082 insertions(+), 523 deletions(-)
 rename backends/cadence/{ => utils}/runtime/BUCK (100%)
 rename backends/cadence/{ => utils}/runtime/__init__.py (100%)
 rename backends/cadence/{ => utils}/runtime/et_pal.cpp (100%)
 rename backends/cadence/{ => utils}/runtime/etdump.py (100%)
 rename backends/cadence/{ => utils}/runtime/executor.py (100%)
 rename backends/cadence/{ => utils}/runtime/executor_main.sh (100%)
 rename backends/cadence/{ => utils}/runtime/runtime.py (100%)
 rename backends/cadence/{ => utils}/runtime/targets.bzl (100%)
 rename backends/cadence/{ => utils}/runtime/utils.py (100%)
 create mode 100644 backends/cadence/vision/config_generator/README.md
 create mode 100644 backends/cadence/vision/config_generator/generate_combined_configs.py
 create mode 100644 backends/cadence/vision/config_generator/generate_idma_buffers.py
 create mode 100644 backends/cadence/vision/config_generator/generate_layer_configs.py
 create mode 100644 backends/cadence/vision/config_generator/layer_configs_16k.h
 create mode 100644 backends/cadence/vision/config_generator/layer_configs_24k.h
 create mode 100644 backends/cadence/vision/config_generator/layer_configs_32k.h
 create mode 100644 backends/cadence/vision/config_generator/layer_configs_4k.h
 create mode 100644 backends/cadence/vision/config_generator/layer_configs_61k.h
 create mode 100644 backends/cadence/vision/config_generator/layer_configs_8k.h
 create mode 100644 backends/cadence/vision/config_generator/layer_configs_cache.h
 create mode 100644 backends/cadence/vision/operators/TARGETS
 create mode 100644 backends/cadence/vision/operators/conv/conv_exec_1x1j1d1.c
 create mode 100644 backends/cadence/vision/operators/conv/conv_exec_1x1j2d1.c
 create mode 100644 backends/cadence/vision/operators/conv/conv_exec_3x3j1d1.c
 create mode 100644 backends/cadence/vision/operators/conv/conv_exec_3x3j2d1.c
 create mode 100644 backends/cadence/vision/operators/conv/conv_exec_7x7j2d1.c
 create mode 100644 backends/cadence/vision/operators/conv/conv_kernel_dispatcher.c
 create mode 100644 backends/cadence/vision/operators/conv/kernel_executors.h
 create mode 100644 backends/cadence/vision/operators/layer_configs.h
 create mode 100644 backends/cadence/vision/operators/maxpool/maxpool_exec_mxnj2.c
 create mode 100644 backends/cadence/vision/operators/maxpool/maxpool_executors.h
 create mode 100644 backends/cadence/vision/operators/mean/mean_exec_dma.c
 create mode 100644 backends/cadence/vision/operators/mean/mean_executors.h
 create mode 100644 backends/cadence/vision/operators/op_max_pool2d_with_indices.cpp
 create mode 100644 backends/cadence/vision/operators/op_mean.cpp
 create mode 100644 backends/cadence/vision/third-party/CMakeLists.txt
 delete mode 100644 backends/cadence/vision/third-party/dummy.c
 create mode 100644 backends/cadence/vision/third-party/include/dma.h
 create mode 100644 backends/cadence/vision/third-party/include/dump_tensor.h
 create mode 100644 backends/cadence/vision/third-party/include/lib.h
 create mode 100644 backends/cadence/vision/third-party/include/memory_manager.h
 create mode 100644 backends/cadence/vision/third-party/include/utils.h
 delete mode 100644 backends/cadence/vision/third-party/include_private/idma_init.h
 create mode 100644 backends/cadence/vision/third-party/library/api/dequantize.c
 create mode 100644 backends/cadence/vision/third-party/library/api/maxpool2df.c
 create mode 100644 backends/cadence/vision/third-party/library/api/mean.c
 create mode 100644 backends/cadence/vision/third-party/library/api/quanitze_relu.c
 create mode 100644 backends/cadence/vision/third-party/library/api/quantizef.c
 create mode 100644 backends/cadence/vision/third-party/library/api/vaddf.c
 create mode 100644 backends/cadence/vision/third-party/library/api/vdot_zeropt.c
 create mode 100644 backends/cadence/vision/third-party/library/dma.c
 create mode 100644 backends/cadence/vision/third-party/library/memory_manager.c
 create mode 100644 backends/cadence/vision/third-party/library/utils.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOD.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_VQ.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_AsymQ_S8IX.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I16I8.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I8I32.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_S32IX.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_datatransform.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD_S16.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW_S16.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_SO.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD_S16.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.c
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_extend_edge.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_fill_tile.h
 create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_helper.c
 create mode 100644 backends/cadence/vision/third-party/libxai/include/xai_cnn.h
 create mode 100644 backends/cadence/vision/third-party/libxai/include/xai_cnn_api.h
 create mode 100644 backends/cadence/vision/third-party/libxai/include/xai_intrin.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_common.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_params.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_cnn_common.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_cnn_version.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_config_api.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_core.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_core_api.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_tile_manager.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_cast.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_cast.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_cast_scalar.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_mul_S32.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.h
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/xai_buildinfo.c
 create mode 100644 backends/cadence/vision/third-party/libxai_common/src/xai_errstr.c

diff --git a/.gitignore b/.gitignore
index 02dcea02026..1f488009a29 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,7 +32,6 @@ build-profiling/
 *.model
 *.etdump
 tokenizer.json
-*.pte
 *.ptd
 !test_bpe_tokenizer.bin
 !test_tiktoken_tokenizer.model
@@ -69,6 +68,11 @@ xcuserdata/
 /src/executorch/share/
 /src/executorch/version.py
 *_etdump
+/runtime/core/portable_type/c10/CMakeFiles/
+/runtime/core/portable_type/c10/bin/
+/runtime/core/portable_type/c10/Makefile
+/runtime/core/portable_type/c10/cmake_install.cmake
+/runtime/core/portable_type/c10/*.a
 
 # Android
 *.aar
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6467e21706e..b7d038e131a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -241,13 +241,21 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
 endif()
 
-if(EXECUTORCH_OPTIMIZE_SIZE)
-  # -Os: Optimize for size.
-  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Os")
+if(NOT EXECUTORCH_BUILD_CADENCE)
+  if(EXECUTORCH_OPTIMIZE_SIZE)
+    # -Os: Optimize for size (selected by the EXECUTORCH_OPTIMIZE_SIZE option)
+    set(CMAKE_CXX_FLAGS_RELEASE "-Os ${CMAKE_CXX_FLAGS_RELEASE}")
+  else()
+    # -O2: Moderate opt. (default for non-Cadence release builds)
+    set(CMAKE_CXX_FLAGS_RELEASE "-O2 ${CMAKE_CXX_FLAGS_RELEASE}")
+  endif()
 else()
-  # -O2: Moderate opt.
-  set(CMAKE_CXX_FLAGS_RELEASE "-O2 ${CMAKE_CXX_FLAGS_RELEASE}")
+  set(CMAKE_CXX_FLAGS_RELEASE
+    "-O3 -mcoproc -mlongcalls -LNO:simd  -ffunction-sections -fsigned-char -fno-exceptions -INLINE:requested -fno-zero-initialized-in-bss -mtext-section-literals -fmessage-length=0")
+  set(CMAKE_C_FLAGS_RELEASE
+    "-O3 -mcoproc -mlongcalls -LNO:simd  -ffunction-sections -fsigned-char -fno-exceptions -INLINE:requested -fno-zero-initialized-in-bss -mtext-section-literals -fmessage-length=0")
 endif()
+set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g")
 
 if(EXECUTORCH_BUILD_TESTS)
   include(CTest)
diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt
index 271b4806614..4ae621cfe91 100644
--- a/backends/cadence/CMakeLists.txt
+++ b/backends/cadence/CMakeLists.txt
@@ -90,6 +90,10 @@ elseif(EXECUTORCH_FUSION_G3_OPT)
   )
 elseif(EXECUTORCH_VISION_OPT)
   set(TARGET_DIR vision)
+  add_subdirectory(
+    ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party
+    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
+  )
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels)
 else()
   set(TARGET_DIR generic)
diff --git a/backends/cadence/aot/functions_vision.yaml b/backends/cadence/aot/functions_vision.yaml
index cae1e0dc415..f2969a3e6d4 100644
--- a/backends/cadence/aot/functions_vision.yaml
+++ b/backends/cadence/aot/functions_vision.yaml
@@ -85,12 +85,12 @@
 - op: max_pool2d_with_indices.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::max_pool2d_with_indices_out
+      kernel_name: impl::vision::max_pool2d_with_indices_out
 
 - op: mean.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::mean_dim_out
+      kernel_name: impl::vision::mean_dim_out
 
 - op: mul.out
   kernels:
@@ -205,6 +205,16 @@
     - arg_meta: null
       kernel_name: impl::vision::quantized_conv2d_nhwc_out
 
+- func: cadence::quantized_conv2d_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::vision::quantized_conv2d_nchw_per_tensor_out
+
+- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::vision::quantized_conv2d_nhwc_per_tensor_out
+
 - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
@@ -249,10 +259,6 @@
     - arg_meta: null
       kernel_name: impl::vision::im2row_per_tensor_out
 
-- func: cadence::quantized_conv.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
-  kernels:
-    - arg_meta: null
-      kernel_name: impl::vision::quantized_conv_per_tensor_out
 
 - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
diff --git a/backends/cadence/utils/FACTO b/backends/cadence/utils/FACTO
index 3b8c778c997..1db37fc79d0 160000
--- a/backends/cadence/utils/FACTO
+++ b/backends/cadence/utils/FACTO
@@ -1 +1 @@
-Subproject commit 3b8c778c99766a8b4d0d04563ae0b16cbb276829
+Subproject commit 1db37fc79d0d59638cbb794fa49d878aafc24461
diff --git a/backends/cadence/runtime/BUCK b/backends/cadence/utils/runtime/BUCK
similarity index 100%
rename from backends/cadence/runtime/BUCK
rename to backends/cadence/utils/runtime/BUCK
diff --git a/backends/cadence/runtime/__init__.py b/backends/cadence/utils/runtime/__init__.py
similarity index 100%
rename from backends/cadence/runtime/__init__.py
rename to backends/cadence/utils/runtime/__init__.py
diff --git a/backends/cadence/runtime/et_pal.cpp b/backends/cadence/utils/runtime/et_pal.cpp
similarity index 100%
rename from backends/cadence/runtime/et_pal.cpp
rename to backends/cadence/utils/runtime/et_pal.cpp
diff --git a/backends/cadence/runtime/etdump.py b/backends/cadence/utils/runtime/etdump.py
similarity index 100%
rename from backends/cadence/runtime/etdump.py
rename to backends/cadence/utils/runtime/etdump.py
diff --git a/backends/cadence/runtime/executor.py b/backends/cadence/utils/runtime/executor.py
similarity index 100%
rename from backends/cadence/runtime/executor.py
rename to backends/cadence/utils/runtime/executor.py
diff --git a/backends/cadence/runtime/executor_main.sh b/backends/cadence/utils/runtime/executor_main.sh
similarity index 100%
rename from backends/cadence/runtime/executor_main.sh
rename to backends/cadence/utils/runtime/executor_main.sh
diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/utils/runtime/runtime.py
similarity index 100%
rename from backends/cadence/runtime/runtime.py
rename to backends/cadence/utils/runtime/runtime.py
diff --git a/backends/cadence/runtime/targets.bzl b/backends/cadence/utils/runtime/targets.bzl
similarity index 100%
rename from backends/cadence/runtime/targets.bzl
rename to backends/cadence/utils/runtime/targets.bzl
diff --git a/backends/cadence/runtime/utils.py b/backends/cadence/utils/runtime/utils.py
similarity index 100%
rename from backends/cadence/runtime/utils.py
rename to backends/cadence/utils/runtime/utils.py
diff --git a/backends/cadence/vision/config_generator/README.md b/backends/cadence/vision/config_generator/README.md
new file mode 100644
index 00000000000..c55fadcb584
--- /dev/null
+++ b/backends/cadence/vision/config_generator/README.md
@@ -0,0 +1,107 @@
+# Config Generator Python
+
+Python tools for extracting convolution layer parameters from neural network models and generating optimized C header configurations for DMA-tiled execution on the Xtensa XRC Vision DSP (XAI CNN runtime).
+
+## Prerequisites
+
+The script requires the Python venv in the executorch tree. Either call the venv interpreter directly (works from any shell) or activate the venv from a **bash** shell (not csh):
+
+```bash
+# The venv is at <executorch>/.venv/
+# All paths below are relative to the executorch root.
+
+# Option 1: call the venv python directly (works from any shell)
+.venv/bin/python3 backends/cadence/vision/config_generator/generate_layer_configs.py ...
+
+# Option 2: activate the venv in a bash shell
+bash
+source .venv/bin/activate
+python3 backends/cadence/vision/config_generator/generate_layer_configs.py ...
+```
+
+> **Note:** If your default shell is `csh`, inline Python commands and
+> `source ... && ...` chains will fail. Either switch to `bash` or invoke
+> the venv Python by its full path.
+
+## Quick Start
+
+```bash
+# Run from the executorch root directory: cd <executorch>
+
+# From a single ExecuTorch .pte binary
+.venv/bin/python3 backends/cadence/vision/config_generator/generate_layer_configs.py \
+    --pte operator_and_model_testing/resnet18/pte/resnet18_quantized.pte \
+    --output backends/cadence/vision/config_generator/conv_layer_configs.h \
+    --dram0 62976 --dram1 62976
+
+# From multiple .pte files (layers are deduplicated automatically)
+.venv/bin/python3 backends/cadence/vision/config_generator/generate_layer_configs.py \
+    --pte operator_and_model_testing/resnet18/pte/resnet18_quantized.pte \
+         operator_and_model_testing/resnet50/pte/resnet50_quantized.pte \
+    --output backends/cadence/vision/config_generator/conv_layer_configs_combined.h \
+    --dram0 62976 --dram1 62976
+
+# From a torchvision model (requires torchvision installed in venv)
+.venv/bin/python3 backends/cadence/vision/config_generator/generate_layer_configs.py \
+    --model resnet18 --input-size 1,3,64,64 \
+    --output backends/cadence/vision/config_generator/conv_layer_configs.h \
+    --dram0 32768 --dram1 32768
+```
+
+### Full working commands
+
+```bash
+# cd to the executorch root first
+cd <path-to-executorch>
+
+# ResNet18 with 62976 bytes per DRAM bank
+.venv/bin/python3 backends/cadence/vision/config_generator/generate_layer_configs.py \
+    --pte operator_and_model_testing/resnet18/pte/resnet18_quantized.pte \
+    --output backends/cadence/vision/config_generator/conv_layer_configs_62k_pte.h \
+    --dram0 62976 --dram1 62976
+
+# ResNet18 + ResNet50 combined
+.venv/bin/python3 backends/cadence/vision/config_generator/generate_layer_configs.py \
+    --pte operator_and_model_testing/resnet18/pte/resnet18_quantized.pte \
+         operator_and_model_testing/resnet50/pte/resnet50_quantized.pte \
+    --output backends/cadence/vision/config_generator/conv_layer_configs_62k_combined.h \
+    --dram0 62976 --dram1 62976
+```
+
+---
+
+## `generate_layer_configs.py` — Arguments
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--model`, `-m` | — | Comma or `+`-separated torchvision model names (e.g. `resnet18+resnet50`) |
+| `--pte` | — | Path to an ExecuTorch `.pte` binary; bootstraps `exir._serialize` from the local source tree — no pip install needed |
+| `--flatc` | cmake-out default | Path to `flatc` binary (auto-detected; only relevant with `--pte`) |
+| `--input-size` | `1,3,64,64` | Input tensor shape `N,C,H,W` (only used with `--model`) |
+| `--output`, `-o` | `conv_layer_configs.h` | Output C header file |
+| `--dram0` | `32768` | DRAM0 size in bytes |
+| `--dram1` | `32768` | DRAM1 size in bytes |
+| `--cache-mode` | off | Append `_cache` to every kernel name |
+
+---
+
+## Output
+
+The generated header contains:
+
+- `conv_layer_config_t` struct with ~60 fields (buffer sizes, tile dimensions, DRAM0/1 placement, kernel name, quantization params)
+- `CONV_LAYER_CONFIGS[]` static array — one entry per unique layer
+- `get_layer_config()`, `get_layer_config_by_params()`, `get_layer_config_by_key()` inline accessors
+
+---
+
+## Directory Structure
+
+```
+config_generator/
+├── generate_layer_configs.py    # Main entry point
+├── generate_idma_buffers.py     # Core tiling / buffer sizing engine
+├── extract_layers_from_pte.py   # .pte/.onnx → JSON (intermediate step)
+├── config/                      # Pre-generated headers
+└── bin/                         # Compare / test utilities
+```
diff --git a/backends/cadence/vision/config_generator/generate_combined_configs.py b/backends/cadence/vision/config_generator/generate_combined_configs.py
new file mode 100644
index 00000000000..4d5ca037a65
--- /dev/null
+++ b/backends/cadence/vision/config_generator/generate_combined_configs.py
@@ -0,0 +1,901 @@
+#!/usr/bin/env python3
+"""
+Generate combined conv2d + maxpool DMA buffer configuration header from PTE files.
+
+Extracts both conv2d and maxpool layers from ExecuTorch .pte binaries and
+generates a single C header with both configuration tables and accessors.
+
+Usage:
+    # Single PTE
+    python generate_combined_configs.py \\
+        --pte resnet18_quantized.pte \\
+        --output layer_configs.h --dram0 62976 --dram1 62976
+
+    # Multiple PTE files (deduplicates automatically)
+    python generate_combined_configs.py \\
+        --pte resnet18_quantized.pte resnet50_quantized.pte \\
+        --output layer_configs.h --dram0 62976 --dram1 62976
+
+    # Force all conv kernels to no-DMA mode
+    python generate_combined_configs.py \\
+        --pte resnet18_quantized.pte \\
+        --output layer_configs.h --dram0 62976 --dram1 62976 --no-dma-mode
+"""
+
+import os
+import sys
+import json
+import argparse
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Resolve paths
+# ---------------------------------------------------------------------------
+# Directory containing this script; it is also put on sys.path further down so
+# that the sibling generate_idma_buffers module can be imported.
+_SCRIPT_DIR = Path(__file__).resolve().parent
+_EXECUTORCH_ROOT = _SCRIPT_DIR.parents[3]          # backends/cadence/vision/config_generator -> executorch/
+# Source-tree locations that _bootstrap_executorch_imports() prepends to sys.path.
+_EXECUTORCH_SRC  = str(_EXECUTORCH_ROOT / 'src' / 'executorch')
+_EXECUTORCH_PARENT = str(_EXECUTORCH_ROOT / 'src')
+
+# Try multiple known flatc locations
+# NOTE: the two pip-out entries are tied to a linux-x86_64 / CPython 3.11 build
+# directory name and will not match other platforms or interpreter versions.
+_FLATC_CANDIDATES = [
+    _EXECUTORCH_ROOT / 'cmake-out' / 'third-party' / 'flatc_ep' / 'bin' / 'flatc',
+    _EXECUTORCH_ROOT / 'cmake-out-generic-all' / 'third-party' / 'flatc_ep' / 'bin' / 'flatc',
+    _EXECUTORCH_ROOT / 'pip-out' / 'lib.linux-x86_64-cpython-311' / 'executorch' / 'data' / 'bin' / 'flatc',
+    _EXECUTORCH_ROOT / 'pip-out' / 'temp.linux-x86_64-cpython-311' / 'cmake-out' / 'third-party' / 'flatc_ep' / 'bin' / 'flatc',
+]
+# First candidate that exists on disk.  When none exists this falls back to the
+# first entry, so the value is always a path string but may not name a real file.
+_FLATC_DEFAULT = str(next((p for p in _FLATC_CANDIDATES if p.exists()), _FLATC_CANDIDATES[0]))
+
+# Import conv buffer calculation
+sys.path.insert(0, str(_SCRIPT_DIR))
+from generate_idma_buffers import (
+    find_max_tile_config,
+    calculate_buffer_sizes_with_rows,
+    calculate_buffer_placement,
+    DRAM_SIZE_0,
+    DRAM_SIZE_1,
+)
+
+ELEMENT_SIZE_F32 = 4  # float32 bytes
+
+
+# =====================================================================
+# Bootstrap executorch imports (shared with generate_layer_configs.py)
+# =====================================================================
+
+def _bootstrap_executorch_imports(flatc_path=None):
+    """Make ``executorch.exir`` importable from the source tree and locate flatc.
+
+    Prepends ``_EXECUTORCH_PARENT`` and ``_EXECUTORCH_SRC`` to ``sys.path`` and,
+    for ``executorch`` and ``executorch.exir``, registers an empty stand-in
+    module in ``sys.modules`` (only when the name is not loaded yet) whose
+    ``__path__`` points into the source tree, so that submodule imports resolve
+    there.
+
+    Args:
+        flatc_path: Optional explicit path to the ``flatc`` binary.  When it
+            names an existing file it replaces any ``FLATC_EXECUTABLE`` value
+            already in the environment.  When omitted, the auto-detected
+            ``_FLATC_DEFAULT`` is used, but only if the variable is not set.
+            A path that is not an existing file leaves the environment
+            untouched.
+    """
+    import types
+
+    # Same insertion order as before: the parent goes in first, so the source
+    # directory ends up ahead of it on sys.path.
+    for entry in (_EXECUTORCH_PARENT, _EXECUTORCH_SRC):
+        if entry not in sys.path:
+            sys.path.insert(0, entry)
+
+    stub_packages = (
+        ('executorch', _EXECUTORCH_SRC),
+        ('executorch.exir', os.path.join(_EXECUTORCH_SRC, 'exir')),
+    )
+    for pkg, pkg_dir in stub_packages:
+        if pkg in sys.modules:
+            continue
+        stub = types.ModuleType(pkg)
+        stub.__path__ = [pkg_dir]
+        stub.__package__ = pkg
+        sys.modules[pkg] = stub
+
+    resolved = flatc_path or _FLATC_DEFAULT
+    if os.path.isfile(resolved):
+        if flatc_path:
+            # An explicit request must win over a value inherited from the
+            # environment; setdefault() would silently ignore it.
+            os.environ['FLATC_EXECUTABLE'] = resolved
+        else:
+            os.environ.setdefault('FLATC_EXECUTABLE', resolved)
+
+
+# =====================================================================
+# PTE extraction — conv2d and maxpool
+# =====================================================================
+
+def extract_layers_from_pte(pte_file, flatc_path=None):
+    """
+    Extract the unique conv2d and maxpool layers from a .pte binary.
+
+    Only the first chain of the first execution plan is scanned.  Layers are
+    deduplicated by geometry and reported in first-seen order.  Operators whose
+    tensors are not 4-D are skipped, because the tile calculators in this
+    script only describe 2-D layers.
+
+    Args:
+        pte_file: Path to the ExecuTorch ``.pte`` file.
+        flatc_path: Optional ``flatc`` binary, forwarded to
+            ``_bootstrap_executorch_imports``.
+
+    Returns:
+        (conv_layers, maxpool_layers)
+        ``conv_layers`` is a list of dicts with keys ``layer_id``, ``name``,
+        ``input`` (W, H, C), ``output`` (W, H, C), ``kernel`` (W, H, IC, OC),
+        ``stride``, ``padding`` and ``dilation`` (tuples, in the order stored
+        in the program).
+        ``maxpool_layers`` is a list of dicts with keys ``name``,
+        ``src_width``, ``src_height``, ``channels``, ``kernel_h``,
+        ``kernel_w``, ``stride_h``, ``stride_w``, ``pad_h`` and ``pad_w``.
+    """
+    _bootstrap_executorch_imports(flatc_path)
+
+    from executorch.exir._serialize._program import deserialize_pte_binary
+    from executorch.exir.schema import KernelCall, Int, IntList, Tensor
+
+    pte_path = Path(pte_file)
+    print(f"Loading PTE: {pte_path} ...")
+
+    pte_file_obj = deserialize_pte_binary(pte_path.read_bytes())
+
+    # The deserializer may hand back the program itself or an object that
+    # exposes it as ``.program``; accept both.
+    program = getattr(pte_file_obj, 'program', pte_file_obj)
+
+    plan   = program.execution_plan[0]
+    values = plan.values
+
+    def _tensor(idx):
+        v = values[idx].val
+        return v if isinstance(v, Tensor) else None
+
+    def _int_val(idx):
+        v = values[idx].val
+        return v.int_val if isinstance(v, Int) else None
+
+    def _intlist_val(idx):
+        v = values[idx].val
+        if isinstance(v, IntList):
+            return [_int_val(i) for i in v.items]
+        return None
+
+    def _pair(idx, default):
+        """Read an int list argument as a 2-element list.
+
+        A missing or empty list yields ``default``; a single value is applied
+        to both axes.
+        """
+        vals = _intlist_val(idx)
+        if not vals:
+            return list(default)
+        if len(vals) == 1:
+            return [vals[0], vals[0]]
+        return vals
+
+    def _is_4d(tensor):
+        return tensor is not None and len(tensor.sizes) == 4
+
+    CONV_OPS = {
+        'cadence::quantized_conv2d_nchw',
+        'aten::conv2d',
+        'aten::convolution',
+    }
+    MAXPOOL_OPS = {
+        'aten::max_pool2d_with_indices',
+        'aten::max_pool2d',
+    }
+
+    conv_layers = []
+    conv_seen = set()
+    maxpool_layers = []
+    maxpool_seen = set()
+
+    for instr in plan.chains[0].instructions:
+        call = instr.instr_args
+        if not isinstance(call, KernelCall):
+            continue
+        op_name = plan.operators[call.op_index].name
+        args = call.args
+
+        # --- Conv2d ---
+        if op_name in CONV_OPS:
+            input_t  = _tensor(args[0])
+            weight_t = _tensor(args[1])
+            output_t = _tensor(args[-1])
+            if not (_is_4d(input_t) and _is_4d(weight_t) and _is_4d(output_t)):
+                continue
+
+            stride   = _pair(args[3], (1, 1))
+            padding  = _pair(args[4], (0, 0))
+            dilation = _pair(args[5], (1, 1))
+
+            _, in_c,  in_h,  in_w  = input_t.sizes
+            _, out_c, out_h, out_w = output_t.sizes
+            w_oc, w_ic, k_h, k_w   = weight_t.sizes
+
+            geometry = {
+                'input':   (in_w,  in_h,  in_c),
+                'output':  (out_w, out_h, out_c),
+                'kernel':  (k_w,   k_h,   w_ic,  w_oc),
+                'stride':  tuple(stride),
+                'padding': tuple(padding),
+                'dilation':tuple(dilation),
+            }
+            key = (geometry['input'], geometry['output'], geometry['kernel'],
+                   geometry['stride'], geometry['padding'], geometry['dilation'])
+            if key in conv_seen:
+                continue
+            conv_seen.add(key)
+            # layer_id is the position among the unique conv layers.
+            conv_layers.append({
+                'layer_id': len(conv_layers),
+                'name': f"conv_{k_h}x{k_w}_s{stride[0]}_ic{in_c}_oc{out_c}",
+                'input': geometry['input'],
+                'output': geometry['output'],
+                'kernel': geometry['kernel'],
+                'stride': geometry['stride'],
+                'padding': geometry['padding'],
+                'dilation': geometry['dilation'],
+            })
+
+        # --- MaxPool ---
+        elif op_name in MAXPOOL_OPS:
+            input_t  = _tensor(args[0])
+            # max_pool2d_with_indices: input, kernel_size, stride, padding, dilation, ceil_mode, output, indices
+            # max_pool2d:              input, kernel_size, stride, padding, dilation, ceil_mode, output
+            if not _is_4d(input_t):
+                continue
+
+            kernel_size = _pair(args[1], (2, 2))
+            # An empty stride list means "same as the kernel size".
+            mp_stride   = _pair(args[2], kernel_size)
+            mp_padding  = _pair(args[3], (0, 0))
+
+            _, C, H, W = input_t.sizes
+            kh, kw = kernel_size[0], kernel_size[1]
+            sh, sw = mp_stride[0], mp_stride[1]
+            ph, pw = mp_padding[0], mp_padding[1]
+
+            # NOTE(review): dilation and ceil_mode are not part of the key or
+            # the emitted layer; confirm they are always the defaults.
+            mp_key = (C, H, W, kh, kw, sh, sw, ph, pw)
+            if mp_key in maxpool_seen:
+                continue
+            maxpool_seen.add(mp_key)
+            maxpool_layers.append({
+                'name': f"maxpool_{kh}x{kw}s{sh}_c{C}_{H}x{W}",
+                'src_width': W,
+                'src_height': H,
+                'channels': C,
+                'kernel_h': kh,
+                'kernel_w': kw,
+                'stride_h': sh,
+                'stride_w': sw,
+                'pad_h': ph,
+                'pad_w': pw,
+            })
+
+    print(f"  Extracted {len(conv_layers)} conv layers, {len(maxpool_layers)} maxpool layers")
+    return conv_layers, maxpool_layers
+
+
+# =====================================================================
+# Conv config calculation (reused from generate_layer_configs.py)
+# =====================================================================
+
+def calculate_conv_config(layer, dram0_size, dram1_size):
+    """Calculate complete conv config dict for one layer.
+
+    Mirrors calculate_layer_config() in generate_layer_configs.py.
+
+    Args:
+        layer: Conv layer dict with keys ``layer_id``, ``name``, ``input``
+            (W, H, C), ``output`` (W, H, C), ``kernel`` (W, H, IC, OC),
+            ``stride`` and ``padding``.
+        dram0_size: DRAM0 size in bytes, forwarded to the tile search and to
+            buffer placement.
+        dram1_size: DRAM1 size in bytes, forwarded likewise.
+
+    Returns:
+        Dict holding every field of ``conv_layer_config_t``.  ``kernel_name``
+        ends in ``_dma`` when ``find_max_tile_config`` finds a tiling, and in
+        ``_no_dma`` (whole layer, all buffers in bank 0) when it does not.
+    """
+    in_w, in_h, in_c = layer['input']
+    out_w, out_h, out_c = layer['output']
+    k_w, k_h, _, _ = layer['kernel']
+    # NOTE(review): stride is unpacked as (w, h); extract_layers_from_pte
+    # stores it in program order, presumably (h, w).  Harmless for square
+    # strides - confirm before using non-square ones.
+    stride_w, stride_h = layer['stride']
+    # Only the first padding entry is used, for both axes.  Dilation is not
+    # forwarded to the tile search and the kernel name always says "d1".
+    pad_w = pad_h = layer['padding'][0]
+
+    padding = (pad_w, pad_w, pad_h, pad_h, 0, 0)
+    # presumably (stride_x, stride_y, accum_shift, relu_max, output_shift, ...,
+    # kernel_h, kernel_w) - TODO confirm against generate_idma_buffers.
+    conv_params = (stride_w, stride_h, 8, 4000, 11, 0, 1, k_h, k_w)
+
+    # Kernel name: fixed names for the known (k_h, k_w, stride_h) combinations,
+    # a generic width-first name for everything else.
+    known_kernels = {
+        (7, 7, 2): "7x7j2d1",
+        (3, 3, 1): "3x3j1d1",
+        (3, 3, 2): "3x3j2d1",
+        (1, 1, 2): "1x1j2d1",
+        (1, 1, 1): "1x1j1d1",
+    }
+    kernel_name = known_kernels.get((k_h, k_w, stride_h),
+                                    f"{k_w}x{k_h}j{stride_w}d1")
+
+    n_tile_size, output_rows, buffer_sizes = find_max_tile_config(
+        input_whd=(in_w, in_h, in_c),
+        output_whd=(out_w, out_h, out_c),
+        kernel_whdn=(k_w, k_h, in_c, out_c),
+        padding=padding,
+        stride_xy=(stride_w, stride_h),
+        kernel_name=kernel_name,
+        data_type="S8S8",
+        dram0_size=dram0_size,
+        dram1_size=dram1_size,
+        conv_params=conv_params,
+    )
+
+    if buffer_sizes is None or n_tile_size == 0 or output_rows == 0:
+        # No-DMA fallback
+        in_dim1_pitch = in_w + 2 * pad_w
+        in_dim2_pitch = in_dim1_pitch * (in_h + 2 * pad_h)
+        out_dim1_pitch = out_w
+        out_dim2_pitch = out_dim1_pitch * out_h
+        coeff_dim1_pitch = k_w
+        coeff_dim2_pitch = coeff_dim1_pitch * k_h
+        coeff_dim3_pitch = coeff_dim2_pitch * in_c
+
+        return {
+            'layer_id': layer['layer_id'], 'layer_name': layer['name'],
+            'kernel_name': kernel_name + "_no_dma",
+            'src_dim1_size': in_w, 'src_dim2_size': in_h, 'src_dim3_size': in_c,
+            'src_dim1_pitch': in_w, 'src_dim2_pitch': in_w * in_h,
+            'dst_dim1_size': out_w, 'dst_dim2_size': out_h, 'dst_dim3_size': out_c,
+            'dst_dim1_pitch': out_w, 'dst_dim2_pitch': out_w * out_h,
+            'in_dim1_size': in_w, 'in_dim1_pitch': in_dim1_pitch,
+            'in_dim2_size': in_h, 'in_dim2_pitch': in_dim2_pitch,
+            'in_dim1_edge1': pad_w, 'in_dim1_edge2': pad_w,
+            'in_dim2_edge1': pad_h, 'in_dim2_edge2': pad_h,
+            'in_dim3_edge1': 0, 'in_dim3_edge2': 0,
+            'in_data_offset': 0, 'in_rows_firstdma': in_h,
+            'out_dim1_size': out_w, 'out_dim1_pitch': out_dim1_pitch,
+            'out_dim2_size': out_h, 'out_dim2_pitch': out_dim2_pitch,
+            'out_dim3_size': out_c,
+            'coeff_dim1_size': k_w, 'coeff_dim2_size': k_h,
+            'coeff_dim3_size': in_c, 'coeff_dim4_size': out_c,
+            'coeff_dim1_pitch': coeff_dim1_pitch, 'coeff_dim2_pitch': coeff_dim2_pitch,
+            'coeff_dim3_pitch': coeff_dim3_pitch,
+            'bias_dim1_size': out_c, 'bias_dim2_size': 1,
+            'outscale_dim1_size': out_c, 'outscale_dim2_size': 1,
+            'input_buffer_size': in_dim2_pitch * in_c,
+            'coeff_buffer_size': coeff_dim3_pitch * out_c,
+            'output_buffer_size': out_dim2_pitch * out_c,
+            'bias_buffer_size': out_c * 4, 'outscale_buffer_size': out_c * 2,
+            'input_ping_dram': 0, 'input_pong_dram': 0, 'coeff_dram': 0,
+            'output_ping_dram': 0, 'output_pong_dram': 0,
+            'bias_dram': 0, 'outscale_dram': 0,
+            'n_tile_size': out_c, 'n_tiles': 1, 'n_tile_size_last': out_c,
+            'height_tiles': 1, 'output_rows': out_h, 'input_rows': in_h,
+            'kernel_w': k_w, 'kernel_h': k_h,
+            'stride_x': stride_w, 'stride_y': stride_h,
+            'padding': pad_w, 'dilation': 1,
+            'accum_shift': 8, 'relu_max': 4000, 'relu_min': 0,
+            'output_shift': 11, 'output_scale': 0, 'flags': 0,
+            'input_zero_point': 0,
+            'config_key': f"{in_c}_{in_h}_{in_w}_{out_c}_{k_h}_{k_w}_{out_h}_{out_w}_{stride_h}_{stride_w}_{pad_w}_1",
+        }
+
+    # DMA mode — use buffer_sizes dict from find_max_tile_config
+    # Input rows needed to produce one tile of `output_rows` output rows.
+    input_rows = k_h + (output_rows - 1) * stride_h
+
+    placement = calculate_buffer_placement(buffer_sizes, dram0_size, dram1_size)
+
+    dilation = buffer_sizes.get('DILATION', 1)
+    config = {
+        'layer_id': layer['layer_id'], 'layer_name': layer['name'],
+        'kernel_name': kernel_name + "_dma",
+        'src_dim1_size': buffer_sizes['SRC_DIM1_SIZE'],
+        'src_dim2_size': buffer_sizes['SRC_DIM2_SIZE'],
+        'src_dim3_size': buffer_sizes['SRC_DIM3_SIZE'],
+        'src_dim1_pitch': buffer_sizes['SRC_DIM1_PITCH'],
+        'src_dim2_pitch': buffer_sizes['SRC_DIM2_PITCH'],
+        'dst_dim1_size': buffer_sizes['DST_DIM1_SIZE'],
+        'dst_dim2_size': buffer_sizes['DST_DIM2_SIZE'],
+        'dst_dim3_size': out_c,
+        'dst_dim1_pitch': buffer_sizes['DST_DIM1_PITCH'],
+        'dst_dim2_pitch': buffer_sizes['DST_DIM2_PITCH'],
+        'in_dim1_size': buffer_sizes['IN_DIM1_SIZE'],
+        'in_dim1_pitch': buffer_sizes['IN_DIM1_PITCH'],
+        'in_dim2_size': buffer_sizes['IN_DIM2_SIZE'],
+        'in_dim2_pitch': buffer_sizes['IN_DIM2_PITCH'],
+        'in_dim1_edge1': padding[0], 'in_dim1_edge2': padding[1],
+        'in_dim2_edge1': padding[2], 'in_dim2_edge2': padding[3],
+        'in_dim3_edge1': padding[4], 'in_dim3_edge2': padding[5],
+        'in_data_offset': buffer_sizes['IN_DATA_OFFSET'],
+        'in_rows_firstdma': buffer_sizes['IN_ROWS_FIRSTDMA'],
+        'out_dim1_size': buffer_sizes['OUT_DIM1_SIZE'],
+        'out_dim1_pitch': buffer_sizes['OUT_DIM1_PITCH'],
+        'out_dim2_size': buffer_sizes['OUT_DIM2_SIZE'],
+        'out_dim2_pitch': buffer_sizes['OUT_DIM2_PITCH'],
+        'out_dim3_size': buffer_sizes['OUT_DIM3_SIZE'],
+        'coeff_dim1_size': buffer_sizes['COEFF_DIM1_SIZE'],
+        'coeff_dim2_size': buffer_sizes['COEFF_DIM2_SIZE'],
+        'coeff_dim3_size': buffer_sizes['COEFF_DIM3_SIZE'],
+        'coeff_dim4_size': buffer_sizes['COEFF_DIM4_SIZE'],
+        'coeff_dim1_pitch': buffer_sizes['COEFF_DIM1_PITCH'],
+        'coeff_dim2_pitch': buffer_sizes['COEFF_DIM2_PITCH'],
+        'coeff_dim3_pitch': buffer_sizes['COEFF_DIM3_PITCH'],
+        'bias_dim1_size': buffer_sizes['BIAS_DIM1_SIZE'],
+        'bias_dim2_size': buffer_sizes['BIAS_DIM2_SIZE'],
+        'outscale_dim1_size': buffer_sizes['OUTSCALE_DIM1_SIZE'],
+        'outscale_dim2_size': buffer_sizes['OUTSCALE_DIM2_SIZE'],
+        'input_buffer_size': buffer_sizes['IN'],
+        'coeff_buffer_size': buffer_sizes['COEFF'],
+        'output_buffer_size': buffer_sizes['OUT'],
+        'bias_buffer_size': buffer_sizes['BIAS'],
+        'outscale_buffer_size': buffer_sizes['OUTSCALE'],
+        'input_ping_dram': placement.get('IN1_dram', 0),
+        'input_pong_dram': placement.get('IN2_dram', 1),
+        'coeff_dram': placement.get('COEFF_dram', 0),
+        'output_ping_dram': placement.get('OUT1_dram', 1),
+        'output_pong_dram': placement.get('OUT2_dram', 1),
+        'bias_dram': placement.get('BIAS_dram', 1),
+        'outscale_dram': placement.get('OUTSCALE_dram', 1),
+        'n_tile_size': buffer_sizes['N_TILE_SIZE'],
+        'n_tiles': buffer_sizes['N_TILES'],
+        'n_tile_size_last': buffer_sizes['N_TILE_SIZE_LAST'],
+        # 'HIGHT_TILES' (sic) is the key name read from buffer_sizes.
+        'height_tiles': buffer_sizes['HIGHT_TILES'],
+        'output_rows': output_rows,
+        'input_rows': input_rows,
+        'stride_x': buffer_sizes.get('STRIDEX', stride_w),
+        'stride_y': buffer_sizes.get('STRIDEY', stride_h),
+        'accum_shift': buffer_sizes.get('ACCUM_SHIFT', 8),
+        'relu_max': buffer_sizes.get('RELU_MAX', 4000),
+        'relu_min': buffer_sizes.get('RELU_MIN', 0),
+        'output_shift': buffer_sizes.get('OUTPUT_SHIFT', 11),
+        'output_scale': buffer_sizes.get('OUTPUT_SCALE', 0),
+        'dilation': dilation,
+        'kernel_w': k_w, 'kernel_h': k_h,
+        'padding': pad_w, 'flags': buffer_sizes.get('FLAGS', 0),
+        'input_zero_point': 0,
+        'config_key': f"{in_c}_{in_h}_{in_w}_{out_c}_{k_h}_{k_w}_{out_h}_{out_w}_{stride_h}_{stride_w}_{pad_w}_{dilation}",
+    }
+    return config
+
+
+# =====================================================================
+# Maxpool config calculation (reused from generate_maxpool_configs.py)
+# =====================================================================
+
+def calculate_maxpool_buffers(layer, c_tile_size, output_rows):
+    """Derive tile geometry and buffer sizes for one maxpool tiling choice.
+
+    Args:
+        layer: Maxpool layer dict (``src_width``, ``src_height``, ``channels``,
+            ``kernel_h``/``kernel_w``, ``stride_h``/``stride_w``,
+            ``pad_h``/``pad_w``).
+        c_tile_size: Number of channels handled per tile.
+        output_rows: Number of output rows produced per tile.
+
+    Returns:
+        Dict with the layer's output dimensions, the input/output tile
+        geometry, the channel and height tile counts, and the input/output
+        buffer sizes in bytes (``ELEMENT_SIZE_F32`` bytes per element).
+    """
+    src_w, src_h = layer['src_width'], layer['src_height']
+    k_h, k_w = layer['kernel_h'], layer['kernel_w']
+    s_h, s_w = layer['stride_h'], layer['stride_w']
+    p_h, p_w = layer['pad_h'], layer['pad_w']
+    channels = layer['channels']
+
+    # Output extent of the whole layer.
+    out_w = (src_w + 2 * p_w - k_w) // s_w + 1
+    out_h = (src_h + 2 * p_h - k_h) // s_h + 1
+
+    # Input rows consumed by one tile, then the tile including its padded border.
+    rows_needed = (output_rows - 1) * s_h + k_h
+    padded_w = src_w + 2 * p_w
+    padded_rows = rows_needed + 2 * p_h
+    in_plane = padded_w * padded_rows
+    out_plane = out_w * output_rows
+
+    # Channel tiling: every tile is full except possibly the last one.
+    num_c_tiles = (channels + c_tile_size - 1) // c_tile_size
+    last_c_tile = channels - c_tile_size * (num_c_tiles - 1)
+
+    return {
+        'dst_width': out_w,
+        'dst_height': out_h,
+        'input_rows': rows_needed,
+        'in_tile_w': padded_w,
+        'in_tile_rows': padded_rows,
+        'in_tile_plane': in_plane,
+        # Offset of the first real element inside the padded tile.
+        'in_data_offset': p_h * padded_w + p_w,
+        'out_tile_w': out_w,
+        'out_tile_rows': output_rows,
+        'out_tile_plane': out_plane,
+        'c_tile_size': c_tile_size,
+        'c_tiles': num_c_tiles,
+        'c_tile_size_last': last_c_tile,
+        'height_tiles': (out_h + output_rows - 1) // output_rows,
+        'output_rows': output_rows,
+        'input_buffer_size': c_tile_size * in_plane * ELEMENT_SIZE_F32,
+        'output_buffer_size': c_tile_size * out_plane * ELEMENT_SIZE_F32,
+    }
+
+
+def find_maxpool_tiling(layer, dram0_size, dram1_size):
+    """Pick the largest maxpool tile whose buffers fit in one DRAM bank.
+
+    Channel tile size has priority over row count: the search tries channel
+    tiles from ``layer['channels']`` downward and, for each, row counts from
+    the full output height downward, and stops at the first combination whose
+    input plus output buffer fits in the smaller of the two banks.
+
+    Args:
+        layer: Maxpool layer dict as consumed by ``calculate_maxpool_buffers``.
+        dram0_size: DRAM0 size in bytes.
+        dram1_size: DRAM1 size in bytes.
+
+    Returns:
+        ``(c_tile_size, output_rows, buffers)`` for the chosen tiling, or
+        ``(0, 0, None)`` when not even a one-channel, one-row tile fits.
+    """
+    channels = layer['channels']
+    dst_h = ((layer['src_height'] + 2 * layer['pad_h'] - layer['kernel_h'])
+             // layer['stride_h'] + 1)
+    # One input and one output tile must share a bank, so the smaller bank
+    # is the limit.
+    bank = min(dram0_size, dram1_size)
+
+    for c_tile in range(channels, 0, -1):
+        for rows in range(dst_h, 0, -1):
+            buf = calculate_maxpool_buffers(layer, c_tile, rows)
+            if buf['input_buffer_size'] + buf['output_buffer_size'] <= bank:
+                # Both loops run largest-first, so the first fit is already
+                # the best one; smaller channel tiles cannot improve on it.
+                return c_tile, rows, buf
+    return 0, 0, None
+
+
+def build_maxpool_config(layer_id, layer, dram0_size, dram1_size):
+    """Assemble the full ``maxpool_layer_config_t`` field dict for one layer.
+
+    Args:
+        layer_id: Index stored in the config's ``layer_id`` field.
+        layer: Maxpool layer dict (geometry keys plus optional ``name``).
+        dram0_size: DRAM0 size in bytes, forwarded to the tiling search.
+        dram1_size: DRAM1 size in bytes, forwarded to the tiling search.
+
+    Returns:
+        Dict with one entry per struct field.  When no tiling fits, the
+        values describe the whole layer as a single untiled block.
+    """
+    _, _, buf = find_maxpool_tiling(layer, dram0_size, dram1_size)
+    if buf is None:
+        # Nothing fits: fall back to all channels and all output rows at once.
+        full_rows = ((layer['src_height'] + 2 * layer['pad_h'] - layer['kernel_h'])
+                     // layer['stride_h'] + 1)
+        buf = calculate_maxpool_buffers(layer, layer['channels'], full_rows)
+
+    src_w = layer['src_width']
+    src_h = layer['src_height']
+    channels = layer['channels']
+
+    window_keys = ('kernel_h', 'kernel_w', 'stride_h', 'stride_w', 'pad_h', 'pad_w')
+    tile_keys = (
+        'in_tile_w', 'in_tile_rows', 'in_tile_plane', 'in_data_offset',
+        'out_tile_w', 'out_tile_rows', 'out_tile_plane',
+        'c_tile_size', 'c_tiles', 'c_tile_size_last',
+        'height_tiles', 'output_rows', 'input_rows',
+        'input_buffer_size', 'output_buffer_size',
+    )
+
+    # Lookup key: channels, height, width, then the six window parameters.
+    key_parts = [channels, src_h, src_w] + [layer[k] for k in window_keys]
+
+    cfg = {
+        'layer_id': layer_id,
+        'layer_name': layer.get('name', f"maxpool_{layer_id}"),
+        'config_key': "_".join(str(part) for part in key_parts),
+        'src_width': src_w,
+        'src_height': src_h,
+        'channels': channels,
+        'dst_width': buf['dst_width'],
+        'dst_height': buf['dst_height'],
+        'src_row_pitch': src_w,
+        'src_plane_pitch': src_h * src_w,
+        'dst_row_pitch': buf['dst_width'],
+        'dst_plane_pitch': buf['dst_height'] * buf['dst_width'],
+    }
+    cfg.update((k, layer[k]) for k in window_keys)
+    cfg.update((k, buf[k]) for k in tile_keys)
+    # Fixed bank assignment: each bank holds one input and one output buffer.
+    cfg.update(
+        input_ping_dram=0,
+        input_pong_dram=1,
+        output_ping_dram=1,
+        output_pong_dram=0,
+    )
+    return cfg
+
+
+# =====================================================================
+# Combined C header generation
+# =====================================================================
+
+def _write_config_table(f, configs, fields, str_fields):
+    """Write one designated-initializer entry per config, then close the array.
+
+    Fields named in ``str_fields`` are emitted as quoted C string literals,
+    all others verbatim.  Values are not escaped, so string fields must not
+    contain quotes or backslashes.
+    """
+    for cfg in configs:
+        f.write("    {\n")
+        for fld in fields:
+            val = cfg[fld]
+            if fld in str_fields:
+                f.write(f"        .{fld} = \"{val}\",\n")
+            else:
+                f.write(f"        .{fld} = {val},\n")
+        f.write("    },\n")
+    f.write("};\n\n")
+
+
+def generate_combined_header(conv_configs, maxpool_configs, output_file,
+                             dram0_size, dram1_size, no_dma_mode=False):
+    """Write the combined conv2d + maxpool C header to ``output_file``.
+
+    The header contains the two DRAM size macros, the ``conv_layer_config_t``
+    and ``maxpool_layer_config_t`` struct definitions, one static table per
+    layer kind, and inline lookup accessors.
+
+    Args:
+        conv_configs: List of conv config dicts (one table entry each).
+        maxpool_configs: List of maxpool config dicts (one table entry each).
+        output_file: Path of the header to create or overwrite.
+        dram0_size: DRAM0 size in bytes for ``IDMA_BUFFER_SIZE_DRAM0``.
+        dram1_size: DRAM1 size in bytes for ``IDMA_BUFFER_SIZE_DRAM1``.
+        no_dma_mode: When true, both DRAM size macros are emitted as 0.
+    """
+    _dram0 = 0 if no_dma_mode else dram0_size
+    _dram1 = 0 if no_dma_mode else dram1_size
+
+    # NOTE(review): an empty config list produces a zero-length array with an
+    # empty initializer, which strict ISO C compilers may reject.
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write("""\
+/*
+ * layer_configs.h
+ *
+ * Auto-generated conv2d + maxpool layer configurations
+ * Generated from PTE extraction by generate_combined_configs.py
+ *
+ * DO NOT EDIT MANUALLY
+ */
+
+#ifndef LAYER_CONFIGS_H
+#define LAYER_CONFIGS_H
+
+#include <stdint.h>
+#include <stddef.h>  /* for NULL */
+
+""")
+        # ----------------------------------------------------------
+        # DRAM macros
+        # ----------------------------------------------------------
+        f.write(f"#define IDMA_BUFFER_SIZE_DRAM0 ({_dram0})  /* {_dram0 // 1024} KB */\n")
+        f.write(f"#define IDMA_BUFFER_SIZE_DRAM1 ({_dram1})  /* {_dram1 // 1024} KB */\n\n")
+
+        # ===========================================================
+        # CONV SECTION
+        # ===========================================================
+        f.write("/* " + "=" * 70 + " */\n")
+        f.write("/*  Conv2d configurations                                              */\n")
+        f.write("/* " + "=" * 70 + " */\n\n")
+
+        f.write("""\
+typedef struct {
+    int layer_id;
+    const char* layer_name;
+    const char* kernel_name;
+    const char* config_key;
+
+    int src_dim1_size;  int src_dim2_size;  int src_dim3_size;
+    int src_dim1_pitch; int src_dim2_pitch;
+
+    int dst_dim1_size;  int dst_dim2_size;  int dst_dim3_size;
+    int dst_dim1_pitch; int dst_dim2_pitch;
+
+    int in_dim1_size;   int in_dim1_pitch;
+    int in_dim2_size;   int in_dim2_pitch;
+    int in_dim1_edge1;  int in_dim1_edge2;
+    int in_dim2_edge1;  int in_dim2_edge2;
+    int in_dim3_edge1;  int in_dim3_edge2;
+    int in_data_offset; int in_rows_firstdma;
+
+    int out_dim1_size;  int out_dim1_pitch;
+    int out_dim2_size;  int out_dim2_pitch;
+    int out_dim3_size;
+
+    int coeff_dim1_size;  int coeff_dim2_size;
+    int coeff_dim3_size;  int coeff_dim4_size;
+    int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch;
+
+    int bias_dim1_size;     int bias_dim2_size;
+    int outscale_dim1_size; int outscale_dim2_size;
+
+    int input_buffer_size;  int coeff_buffer_size;  int output_buffer_size;
+    int bias_buffer_size;   int outscale_buffer_size;
+
+    int input_ping_dram;  int input_pong_dram;  int coeff_dram;
+    int output_ping_dram; int output_pong_dram;
+    int bias_dram;        int outscale_dram;
+
+    int n_tile_size; int n_tiles; int n_tile_size_last;
+    int height_tiles; int output_rows; int input_rows;
+
+    int kernel_w; int kernel_h;
+    int stride_x; int stride_y;
+    int padding;  int dilation;
+    int accum_shift; int relu_max; int relu_min;
+    int output_shift; int output_scale; int flags;
+    int input_zero_point;
+} conv_layer_config_t;
+
+""")
+        f.write(f"#define NUM_CONV_LAYERS {len(conv_configs)}\n\n")
+        f.write("static const conv_layer_config_t CONV_LAYER_CONFIGS[] = {\n")
+
+        conv_fields = [
+            'layer_id', 'layer_name', 'kernel_name', 'config_key',
+            'src_dim1_size', 'src_dim2_size', 'src_dim3_size', 'src_dim1_pitch', 'src_dim2_pitch',
+            'dst_dim1_size', 'dst_dim2_size', 'dst_dim3_size', 'dst_dim1_pitch', 'dst_dim2_pitch',
+            'in_dim1_size', 'in_dim1_pitch', 'in_dim2_size', 'in_dim2_pitch',
+            'in_dim1_edge1', 'in_dim1_edge2', 'in_dim2_edge1', 'in_dim2_edge2',
+            'in_dim3_edge1', 'in_dim3_edge2', 'in_data_offset', 'in_rows_firstdma',
+            'out_dim1_size', 'out_dim1_pitch', 'out_dim2_size', 'out_dim2_pitch', 'out_dim3_size',
+            'coeff_dim1_size', 'coeff_dim2_size', 'coeff_dim3_size', 'coeff_dim4_size',
+            'coeff_dim1_pitch', 'coeff_dim2_pitch', 'coeff_dim3_pitch',
+            'bias_dim1_size', 'bias_dim2_size', 'outscale_dim1_size', 'outscale_dim2_size',
+            'input_buffer_size', 'coeff_buffer_size', 'output_buffer_size',
+            'bias_buffer_size', 'outscale_buffer_size',
+            'input_ping_dram', 'input_pong_dram', 'coeff_dram',
+            'output_ping_dram', 'output_pong_dram', 'bias_dram', 'outscale_dram',
+            'n_tile_size', 'n_tiles', 'n_tile_size_last',
+            'height_tiles', 'output_rows', 'input_rows',
+            'kernel_w', 'kernel_h', 'stride_x', 'stride_y', 'padding', 'dilation',
+            'accum_shift', 'relu_max', 'relu_min', 'output_shift', 'output_scale', 'flags',
+            'input_zero_point',
+        ]
+        str_fields = {'layer_name', 'kernel_name', 'config_key'}
+
+        _write_config_table(f, conv_configs, conv_fields, str_fields)
+
+        # Conv accessors
+        # get_conv_config() indexes the table by position, so it is only
+        # meaningful when each entry's layer_id equals its table index.
+        f.write("""\
+static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; }
+
+static inline const conv_layer_config_t* get_conv_config(int layer_id) {
+    if (layer_id < 0 || layer_id >= NUM_CONV_LAYERS) return NULL;
+    return &CONV_LAYER_CONFIGS[layer_id];
+}
+
+static inline const conv_layer_config_t* get_layer_config_by_params(
+    int ic, int ih, int iw,
+    int oc, int kh, int kw,
+    int oh, int ow,
+    int sy, int sx,
+    int pad, int dil)
+{
+    for (int i = 0; i < NUM_CONV_LAYERS; i++) {
+        const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i];
+        if (cfg->src_dim3_size == ic &&
+            cfg->src_dim2_size == ih &&
+            cfg->src_dim1_size == iw &&
+            cfg->dst_dim3_size == oc &&
+            cfg->coeff_dim2_size == kh &&
+            cfg->coeff_dim1_size == kw &&
+            cfg->dst_dim2_size == oh &&
+            cfg->dst_dim1_size == ow &&
+            cfg->stride_y == sy &&
+            cfg->stride_x == sx &&
+            cfg->padding == pad &&
+            cfg->dilation == dil)
+            return cfg;
+    }
+    return NULL;
+}
+
+static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) {
+    if (config_key == NULL) return NULL;
+    for (int i = 0; i < NUM_CONV_LAYERS; i++) {
+        const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i];
+        if (cfg->config_key != NULL) {
+            const char* a = config_key;
+            const char* b = cfg->config_key;
+            while (*a && *b && *a == *b) { a++; b++; }
+            if (*a == '\\0' && *b == '\\0') return cfg;
+        }
+    }
+    return NULL;
+}
+
+""")
+
+        # ===========================================================
+        # MAXPOOL SECTION
+        # ===========================================================
+        f.write("/* " + "=" * 70 + " */\n")
+        f.write("/*  MaxPool configurations                                             */\n")
+        f.write("/* " + "=" * 70 + " */\n\n")
+
+        f.write("""\
+typedef struct {
+    int layer_id;
+    const char* layer_name;
+    const char* config_key;
+
+    int src_width;  int src_height;  int channels;
+    int dst_width;  int dst_height;
+
+    int src_row_pitch;  int src_plane_pitch;
+    int dst_row_pitch;  int dst_plane_pitch;
+
+    int kernel_h;  int kernel_w;
+    int stride_h;  int stride_w;
+    int pad_h;     int pad_w;
+
+    int in_tile_w;      int in_tile_rows;   int in_tile_plane;
+    int in_data_offset;
+    int out_tile_w;     int out_tile_rows;  int out_tile_plane;
+
+    int c_tile_size;  int c_tiles;  int c_tile_size_last;
+    int height_tiles; int output_rows; int input_rows;
+
+    int input_buffer_size;  int output_buffer_size;
+
+    int input_ping_dram;  int input_pong_dram;
+    int output_ping_dram; int output_pong_dram;
+} maxpool_layer_config_t;
+
+""")
+        f.write(f"#define NUM_MAXPOOL_LAYERS {len(maxpool_configs)}\n\n")
+        f.write("static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = {\n")
+
+        mp_fields = [
+            'layer_id', 'layer_name', 'config_key',
+            'src_width', 'src_height', 'channels',
+            'dst_width', 'dst_height',
+            'src_row_pitch', 'src_plane_pitch', 'dst_row_pitch', 'dst_plane_pitch',
+            'kernel_h', 'kernel_w', 'stride_h', 'stride_w', 'pad_h', 'pad_w',
+            'in_tile_w', 'in_tile_rows', 'in_tile_plane', 'in_data_offset',
+            'out_tile_w', 'out_tile_rows', 'out_tile_plane',
+            'c_tile_size', 'c_tiles', 'c_tile_size_last',
+            'height_tiles', 'output_rows', 'input_rows',
+            'input_buffer_size', 'output_buffer_size',
+            'input_ping_dram', 'input_pong_dram', 'output_ping_dram', 'output_pong_dram',
+        ]
+        mp_str_fields = {'layer_name', 'config_key'}
+
+        _write_config_table(f, maxpool_configs, mp_fields, mp_str_fields)
+
+        # Maxpool accessors
+        f.write("""\
+static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; }
+
+static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) {
+    if (layer_id < 0 || layer_id >= NUM_MAXPOOL_LAYERS) return NULL;
+    return &MAXPOOL_LAYER_CONFIGS[layer_id];
+}
+
+static inline const maxpool_layer_config_t* get_maxpool_config_by_params(
+    int channels, int src_height, int src_width,
+    int kernel_h, int kernel_w,
+    int stride_h, int stride_w,
+    int pad_h, int pad_w)
+{
+    for (int i = 0; i < NUM_MAXPOOL_LAYERS; i++) {
+        const maxpool_layer_config_t* c = &MAXPOOL_LAYER_CONFIGS[i];
+        if (c->channels   == channels   &&
+            c->src_height == src_height &&
+            c->src_width  == src_width  &&
+            c->kernel_h   == kernel_h   &&
+            c->kernel_w   == kernel_w   &&
+            c->stride_h   == stride_h   &&
+            c->stride_w   == stride_w   &&
+            c->pad_h      == pad_h      &&
+            c->pad_w      == pad_w)
+            return c;
+    }
+    return NULL;
+}
+
+#endif /* LAYER_CONFIGS_H */
+""")
+
+    print(f"\nGenerated {output_file}")
+    print(f"  Conv layers:    {len(conv_configs)}")
+    print(f"  Maxpool layers: {len(maxpool_configs)}")
+
+
+# =====================================================================
+# CLI
+# =====================================================================
+
+def main():
+    """Command-line entry point: build the combined conv2d + maxpool header.
+
+    Parses the CLI arguments, extracts conv and maxpool layers from every
+    ``--pte`` file (skipping layers whose shape parameters were already seen
+    in this run), computes a DMA configuration for each unique layer and
+    passes the results to ``generate_combined_header``.
+
+    Returns:
+        int: 0 on success, 1 if one of the given PTE files does not exist.
+    """
+    parser = argparse.ArgumentParser(
+        description='Generate combined conv2d + maxpool DMA config header from PTE files')
+    parser.add_argument('--pte', nargs='+', required=True,
+                        help='One or more ExecuTorch .pte files')
+    parser.add_argument('--output', '-o', default='layer_configs.h',
+                        help='Output C header file (default: layer_configs.h)')
+    parser.add_argument('--dram0', type=int, default=62976,
+                        help='DRAM0 size in bytes (default: 62976)')
+    parser.add_argument('--dram1', type=int, default=62976,
+                        help='DRAM1 size in bytes (default: 62976)')
+    parser.add_argument('--flatc', default=None,
+                        help='Path to flatc binary (auto-detected)')
+    parser.add_argument('--no-dma-mode', action='store_true', default=False,
+                        help='Force all conv kernels to no-DMA mode')
+
+    args = parser.parse_args()
+
+    # Collect layers from all PTE files with deduplication
+    all_conv = []
+    all_maxpool = []
+    conv_seen = set()
+    mp_seen = set()
+
+    for pte_path_str in args.pte:
+        pte_path = Path(pte_path_str)
+        if not pte_path.exists():
+            # Errors go to stderr so they are not mixed with the progress log.
+            print(f"ERROR: PTE file not found: {pte_path}", file=sys.stderr)
+            return 1
+
+        print(f"\nExtracting from: {pte_path}")
+        conv_layers, mp_layers = extract_layers_from_pte(pte_path, flatc_path=args.flatc)
+
+        for layer in conv_layers:
+            key = (layer['input'], layer['output'], layer['kernel'],
+                   layer['stride'], layer['padding'], layer['dilation'])
+            if key in conv_seen:
+                print(f"  [skip dup conv] {layer['name']}")
+                continue
+            conv_seen.add(key)
+            # layer_id is the position among the unique conv layers.
+            layer['layer_id'] = len(all_conv)
+            all_conv.append(layer)
+
+        for layer in mp_layers:
+            key = (layer['channels'], layer['src_height'], layer['src_width'],
+                   layer['kernel_h'], layer['kernel_w'],
+                   layer['stride_h'], layer['stride_w'],
+                   layer['pad_h'], layer['pad_w'])
+            if key in mp_seen:
+                print(f"  [skip dup maxpool] {layer['name']}")
+                continue
+            mp_seen.add(key)
+            all_maxpool.append(layer)
+
+    print(f"\nTotal unique: {len(all_conv)} conv, {len(all_maxpool)} maxpool")
+    print(f"DRAM budget: DRAM0={args.dram0}B  DRAM1={args.dram1}B")
+
+    # Calculate conv configs
+    print("\nCalculating conv configurations...")
+    conv_configs = []
+    for layer in all_conv:
+        print(f"  Conv {layer['layer_id']}: {layer['name']}...")
+        cfg = calculate_conv_config(layer, args.dram0, args.dram1)
+        if cfg:
+            conv_configs.append(cfg)
+            print(f"    [OK] n_tile={cfg['n_tile_size']}, height_tiles={cfg['height_tiles']}, "
+                  f"output_rows={cfg['output_rows']}")
+        else:
+            # NOTE(review): a failed layer is only logged; it is left out of the
+            # header and the exit status stays 0 - confirm this is intended.
+            print("    [FAIL]")
+
+    # Apply no-DMA mode: rename every "*_dma" kernel to "*_no_dma"
+    if args.no_dma_mode:
+        for cfg in conv_configs:
+            if cfg['kernel_name'].endswith('_dma'):
+                cfg['kernel_name'] = cfg['kernel_name'][:-4] + '_no_dma'
+        print("No-DMA mode: all conv kernels set to _no_dma")
+
+    # Calculate maxpool configs (the enumeration index is passed as the layer id)
+    print("\nCalculating maxpool configurations...")
+    mp_configs = []
+    for idx, layer in enumerate(all_maxpool):
+        print(f"  Maxpool {idx}: {layer['name']}...")
+        cfg = build_maxpool_config(idx, layer, args.dram0, args.dram1)
+        mp_configs.append(cfg)
+        print(f"    [OK] c_tile={cfg['c_tile_size']}, height_tiles={cfg['height_tiles']}, "
+              f"output_rows={cfg['output_rows']}")
+
+    # Generate combined header
+    generate_combined_header(conv_configs, mp_configs, args.output,
+                             args.dram0, args.dram1, args.no_dma_mode)
+    return 0
+
+
+# Script entry point: exit with main()'s return value (a None result maps to 0).
+if __name__ == '__main__':
+    sys.exit(main() or 0)
diff --git a/backends/cadence/vision/config_generator/generate_idma_buffers.py b/backends/cadence/vision/config_generator/generate_idma_buffers.py
new file mode 100644
index 00000000000..3dbcd9cfe17
--- /dev/null
+++ b/backends/cadence/vision/config_generator/generate_idma_buffers.py
@@ -0,0 +1,1478 @@
+#!/usr/bin/env python3
+"""
+Generate IDMA buffer size definitions for convolution operations.
+
+This script calculates buffer sizes based on:
+- Processing all width elements in one go
+- Processing 2 output rows in one go
+- Processing all output channels in one go
+"""
+
+# Default DRAM sizes (in bytes), used when no explicit size is passed in
+DRAM_SIZE_0 = 32 * 1024  # 32 KB for DRAM0
+DRAM_SIZE_1 = 32 * 1024  # 32 KB for DRAM1
+def find_max_tile_config(input_whd, output_whd, kernel_whdn, padding, stride_xy, kernel_name="7x7j2d1", data_type="S8S8", dram0_size=None, dram1_size=None, conv_params=None, conv_flags=0):
+    """
+    Find the output-channel tile size and output-row count that fit in DRAM.
+
+    Strategy:
+    1. With output_rows fixed at 2 (the smallest value this search tries),
+       scan n_tile_size upward from 1. Among the sizes that fit, keep the one
+       whose last tile is closest in size to the other tiles; on a tie the
+       larger n_tile_size wins. Scanning stops at the first size that does
+       not fit once at least one size has fit.
+    2. If one tile covers all output channels, keep that n_tile_size and grow
+       output_rows from 3 up to the output height until it stops fitting.
+
+    Args:
+        input_whd: Tuple (width, height, depth) of input
+        output_whd: Tuple (width, height, depth) of output
+        kernel_whdn: Tuple (width, height, depth, num_filters) of kernel
+        padding: Tuple (dim1_edge1, dim1_edge2, dim2_edge1, dim2_edge2, dim3_edge1, dim3_edge2)
+        stride_xy: Tuple (stride_x, stride_y)
+        kernel_name: String identifier for kernel
+        data_type: Data type string
+        dram0_size: Size of DRAM0 in bytes (defaults to DRAM_SIZE_0)
+        dram1_size: Size of DRAM1 in bytes (defaults to DRAM_SIZE_1)
+        conv_params: Tuple of (strideX, strideY, accumShift, reluMax, outputShift, outputScale, dilation, kernelHeight, kernelWidth)
+        conv_flags: Integer flags value (e.g., CNN_CONV_FLAG_RELU)
+
+    Returns:
+        Tuple (best_n_tile_size, best_output_rows, buffer_sizes_dict). When no
+        configuration fits, both numbers are 0 and every size entry of the
+        dictionary is 0.
+    """
+    if dram0_size is None:
+        dram0_size = DRAM_SIZE_0
+    if dram1_size is None:
+        dram1_size = DRAM_SIZE_1
+
+    _, output_h, _ = output_whd
+    _, _, _, kernel_n = kernel_whdn
+
+    print("\n=== Finding Maximum Tile Configuration ===")
+    print(f"Kernel: {kernel_name}")
+    print(f"Total output channels: {kernel_n}")
+    print(f"DRAM0: {dram0_size} bytes, DRAM1: {dram1_size} bytes")
+    print()
+
+    best_n_tile_size = 1
+    best_output_rows = 2  # Minimum output rows should be 2
+    best_buffer_sizes = None
+    all_channels_fit = False
+    best_tile_balance = float('inf')  # Track tile size balance (lower is better)
+
+    # Phase 1: scan n_tile_size upward (at the minimum row count) and keep the
+    # best balanced config; stop at the first size that no longer fits.
+    current_output_rows = 2
+    found_fit = False
+
+    for n_tile_size in range(1, kernel_n + 1):
+        buffer_sizes = calculate_buffer_sizes_with_rows(
+            input_whd, output_whd, kernel_whdn, padding, stride_xy,
+            kernel_name, data_type, n_tile_size, current_output_rows,
+            conv_params, conv_flags
+        )
+
+        # Check if it fits in DRAM
+        placement = calculate_buffer_placement(buffer_sizes, dram0_size, dram1_size)
+
+        if not placement['total_fits']:
+            # Keep looking until something fits; afterwards the first miss ends the scan.
+            if found_fit:
+                print(f"  n_tile_size={n_tile_size}, output_rows={current_output_rows}: Does NOT fit")
+                break
+            continue
+
+        found_fit = True
+
+        # Tile balance: difference between a full tile and the last (remainder) tile
+        n_tiles = (kernel_n + n_tile_size - 1) // n_tile_size
+        last_tile_size = kernel_n - (n_tile_size * (n_tiles - 1))
+        tile_balance = abs(n_tile_size - last_tile_size)
+
+        # Update best if this is more balanced than current best
+        # OR if balance is same but tile size is larger (prefer fewer, larger tiles)
+        if tile_balance < best_tile_balance or \
+           (tile_balance == best_tile_balance and n_tile_size > best_n_tile_size):
+            best_n_tile_size = n_tile_size
+            best_output_rows = current_output_rows
+            best_buffer_sizes = buffer_sizes
+            best_tile_balance = tile_balance
+
+            if n_tile_size >= kernel_n:
+                all_channels_fit = True
+                print(f"  All {kernel_n} output channels fit with {current_output_rows} output rows")
+                break
+
+    # Phase 2: If all channels fit, try increasing output_rows
+    if all_channels_fit:
+        print("\n  Phase 2: Increasing output rows (all channels fit)...")
+
+        for output_rows in range(3, output_h + 1):
+            buffer_sizes = calculate_buffer_sizes_with_rows(
+                input_whd, output_whd, kernel_whdn, padding, stride_xy,
+                kernel_name, data_type, best_n_tile_size, output_rows,
+                conv_params, conv_flags
+            )
+
+            placement = calculate_buffer_placement(buffer_sizes, dram0_size, dram1_size)
+
+            if placement['total_fits']:
+                best_output_rows = output_rows
+                best_buffer_sizes = buffer_sizes
+                print(f"  output_rows={output_rows}: Fits!")
+            else:
+                print(f"  output_rows={output_rows}: Does NOT fit")
+                break
+
+    print("\n=== Best Configuration Found ===")
+    if best_buffer_sizes is None:
+        print("\033[91m  ERROR: No configuration fits in available DRAM!\033[0m")
+        print(f"\033[91m  DRAM0: {dram0_size} bytes, DRAM1: {dram1_size} bytes\033[0m")
+        # The smallest configuration the search tries is 1 channel x 2 rows.
+        print("\033[91m  Minimum required: n_tile_size=1, output_rows=2\033[0m")
+        print(f"\033[91m  Setting all buffer sizes to 0 for kernel {kernel_name}\033[0m")
+
+        best_buffer_sizes = _zeroed_buffer_sizes(padding, kernel_name, data_type)
+        best_n_tile_size = 0
+        best_output_rows = 0
+
+    print(f"  n_tile_size: {best_n_tile_size} (out of {kernel_n} total channels)")
+    print(f"  output_rows: {best_output_rows}")
+    print()
+
+    return best_n_tile_size, best_output_rows, best_buffer_sizes
+
+
+def _zeroed_buffer_sizes(padding, kernel_name, data_type):
+    """Return the buffer-size dictionary used when nothing fits: all sizes are 0."""
+    sizes = dict.fromkeys(
+        ('IN', 'COEFF', 'COEFF_TILE_SIZE_LAST', 'OUT', 'BIAS', 'OUTSCALE'), 0)
+    sizes.update(padding=padding, kernel_name=kernel_name, data_type=data_type)
+    sizes.update(dict.fromkeys((
+        'SRC_DIM1_SIZE', 'SRC_DIM1_PITCH', 'SRC_DIM2_SIZE', 'SRC_DIM2_PITCH', 'SRC_DIM3_SIZE',
+        'DST_DIM1_SIZE', 'DST_DIM1_PITCH', 'DST_DIM2_SIZE', 'DST_DIM2_PITCH',
+        'IN_DIM1_SIZE', 'IN_DIM1_PITCH', 'IN_DIM2_SIZE', 'IN_DIM2_PITCH',
+        'IN_DATA_OFFSET', 'IN_ROWS_FIRSTDMA',
+        'OUT_DIM1_SIZE', 'OUT_DIM1_PITCH', 'OUT_DIM2_SIZE', 'OUT_DIM2_PITCH', 'OUT_DIM3_SIZE',
+        'COEFF_DIM1_SIZE', 'COEFF_DIM2_SIZE', 'COEFF_DIM3_SIZE', 'COEFF_DIM4_SIZE',
+        'COEFF_DIM1_PITCH', 'COEFF_DIM2_PITCH', 'COEFF_DIM3_PITCH',
+        'BIAS_DIM1_SIZE', 'BIAS_DIM2_SIZE',
+        'OUTSCALE_DIM1_SIZE', 'OUTSCALE_DIM2_SIZE',
+        'N_TILE_SIZE', 'N_TILES', 'N_TILE_SIZE_LAST', 'HIGHT_TILES',
+    ), 0))
+    sizes['details'] = {
+        'input_buff_whd': (0, 0, 0),
+        'input_rows_needed': 0,
+        'output_buff_whd': (0, 0, 0),
+    }
+    return sizes
+
+def calculate_buffer_sizes_with_rows(input_whd, output_whd, kernel_whdn, padding, stride_xy, kernel_name="7x7j2d1", data_type="S8S8", n_tile_size=None, output_rows_per_iteration=2, conv_params=None, conv_flags=0):
+    """
+    Compute IDMA buffer sizes and tile geometry for one convolution layer.
+
+    The input tile always spans the full input width (no horizontal tiling);
+    its height is the number of input rows needed to produce
+    output_rows_per_iteration output rows.
+
+    Args:
+        input_whd: Tuple (width, height, depth) of input
+        output_whd: Tuple (width, height, depth) of output
+        kernel_whdn: Tuple (width, height, depth, num_filters) of kernel
+        padding: Tuple (dim1_edge1, dim1_edge2, dim2_edge1, dim2_edge2, dim3_edge1, dim3_edge2)
+        stride_xy: Tuple (stride_x, stride_y)
+        kernel_name: String identifier for kernel, copied into the result
+        data_type: Data type string, copied into the result
+        n_tile_size: Output channels per tile (None = all channels in one tile)
+        output_rows_per_iteration: Number of output rows to process in one iteration
+        conv_params: Tuple of (strideX, strideY, accumShift, reluMax, outputShift, outputScale, dilation, kernelHeight, kernelWidth)
+        conv_flags: Integer flags value, stored as 'FLAGS' when conv_params is given
+
+    Returns:
+        Dictionary with buffer sizes, tile sizes/pitches and tiling counts
+    """
+    in_w, in_h, in_d = input_whd
+    out_w, out_h, _out_d = output_whd
+    k_w, k_h, k_d, k_n = kernel_whdn
+    w_edge1, w_edge2, h_edge1, _h_edge2, d_edge1, d_edge2 = padding
+    _stride_x, stride_y = stride_xy
+
+    rows_out = output_rows_per_iteration
+
+    # Input rows covered by the kernel while producing rows_out output rows
+    rows_in = (rows_out - 1) * stride_y + k_h
+
+    # Input buffer extents: padded width, needed rows, padded depth
+    tile_w = in_w + w_edge1 + w_edge2
+    tile_d = in_d + d_edge1 + d_edge2
+
+    # Output-channel tiling: tile size, tile count and size of the last tile
+    if n_tile_size is not None:
+        tile_n = n_tile_size
+        tile_count = (k_n + n_tile_size - 1) // n_tile_size
+        tile_n_last = k_n - (n_tile_size * (tile_count - 1))
+    else:
+        tile_n, tile_count, tile_n_last = k_n, 1, k_n
+
+    # Elements in one filter (also the pitch between consecutive filters)
+    filter_elems = k_w * k_h * k_d
+
+    # Iterations needed to cover the output height (rounded up)
+    height_tile_count = (out_h + rows_out - 1) // rows_out
+
+    sizes = {
+        'IN': tile_w * rows_in * tile_d,
+        'COEFF': filter_elems * tile_n,
+        'COEFF_TILE_SIZE_LAST': filter_elems * tile_n_last,
+        'OUT': out_w * rows_out * tile_n,
+        'BIAS': k_n * 4,  # S32
+        'OUTSCALE': k_n * 2,  # U16
+        'padding': padding,
+        'kernel_name': kernel_name,
+        'data_type': data_type,
+        # Source: the full, unpadded input
+        'SRC_DIM1_SIZE': in_w,
+        'SRC_DIM1_PITCH': in_w,
+        'SRC_DIM2_SIZE': in_h,
+        'SRC_DIM2_PITCH': in_w * in_h,
+        'SRC_DIM3_SIZE': in_d,
+        # Destination: the full output
+        'DST_DIM1_SIZE': out_w,
+        'DST_DIM1_PITCH': out_w,
+        'DST_DIM2_SIZE': out_h,
+        'DST_DIM2_PITCH': out_w * out_h,
+        # Input tile: row pitch includes the dim1 padding
+        'IN_DIM1_SIZE': in_w,
+        'IN_DIM1_PITCH': tile_w,
+        'IN_DIM2_SIZE': rows_in,
+        'IN_DIM2_PITCH': rows_in * tile_w,
+        'IN_DATA_OFFSET': (h_edge1 * tile_w) + w_edge1,
+        'IN_ROWS_FIRSTDMA': rows_in - h_edge1,
+        # Output tile
+        'OUT_DIM1_SIZE': out_w,
+        'OUT_DIM1_PITCH': out_w,
+        'OUT_DIM2_SIZE': rows_out,
+        'OUT_DIM2_PITCH': out_w * rows_out,
+        'OUT_DIM3_SIZE': tile_n,
+        # Coefficients
+        'COEFF_DIM1_SIZE': k_w,
+        'COEFF_DIM2_SIZE': k_h,
+        'COEFF_DIM3_SIZE': k_d,
+        'COEFF_DIM4_SIZE': k_n,
+        'COEFF_DIM1_PITCH': k_w,
+        'COEFF_DIM2_PITCH': k_w * k_h,
+        'COEFF_DIM3_PITCH': filter_elems,
+        # Bias and output scale: one entry per output channel
+        'BIAS_DIM1_SIZE': k_n,
+        'BIAS_DIM2_SIZE': 1,
+        'OUTSCALE_DIM1_SIZE': k_n,
+        'OUTSCALE_DIM2_SIZE': 1,
+        # Tiling counts
+        'N_TILE_SIZE': tile_n,
+        'N_TILES': tile_count,
+        'N_TILE_SIZE_LAST': tile_n_last,
+        'HIGHT_TILES': height_tile_count,
+        'details': {
+            'input_buff_whd': (tile_w, rows_in, tile_d),
+            'input_rows_needed': rows_in,
+            'output_buff_whd': (out_w, rows_out, tile_n),
+        }
+    }
+
+    if conv_params is None:
+        return sizes
+
+    # Append the convolution parameters after the geometry entries
+    (stride_x_param, stride_y_param, accum_shift, relu_max, output_shift,
+     output_scale, dilation, kernel_height, kernel_width) = conv_params
+    sizes['STRIDEX'] = stride_x_param
+    sizes['STRIDEY'] = stride_y_param
+    sizes['ACCUM_SHIFT'] = accum_shift
+    sizes['RELU_MAX'] = relu_max
+    sizes['RELU_MIN'] = 0  # Default minimum
+    sizes['OUTPUT_SHIFT'] = output_shift
+    sizes['OUTPUT_SCALE'] = output_scale
+    sizes['DILATION'] = dilation
+    sizes['KERNEL_HEIGHT'] = kernel_height
+    sizes['KERNEL_WIDTH'] = kernel_width
+    sizes['FLAGS'] = conv_flags
+    return sizes
+
+def calculate_buffer_sizes(input_whd, output_whd, kernel_whdn, padding, stride_xy, kernel_name="7x7j2d1", data_type="S8S8", n_tile_size=None):
+    """
+    Calculate IDMA buffer sizes for convolution (uses default 2 output rows).
+
+    This is a wrapper around calculate_buffer_sizes_with_rows with
+    output_rows_per_iteration=2 and no convolution parameters.
+
+    Args:
+        input_whd: Tuple (width, height, depth) of input
+        output_whd: Tuple (width, height, depth) of output
+        kernel_whdn: Tuple (width, height, depth, num_filters) of kernel
+        padding: Tuple (dim1_edge1, dim1_edge2, dim2_edge1, dim2_edge2, dim3_edge1, dim3_edge2)
+        stride_xy: Tuple (stride_x, stride_y)
+        kernel_name: String identifier for kernel (e.g., "7x7j2d1")
+        data_type: Data type string (e.g., "S8S8")
+        n_tile_size: Number of output channels per tile (None = all channels)
+
+    Returns:
+        Dictionary with buffer sizes, as returned by
+        calculate_buffer_sizes_with_rows
+    """
+    # The earlier stand-alone implementation that used to follow this return
+    # was unreachable and has been removed.
+    return calculate_buffer_sizes_with_rows(
+        input_whd, output_whd, kernel_whdn, padding, stride_xy,
+        kernel_name, data_type, n_tile_size, output_rows_per_iteration=2)
+
+
+def generate_header_content(buffer_sizes, header_guard="CONVIDMA_BUFFERS_H_"):
+    """
+    Generate C header file content with buffer size definitions.
+    
+    Args:
+        buffer_sizes: Dictionary from calculate_buffer_sizes()
+        header_guard: Header guard name
+    
+    Returns:
+        String containing header file content
+    """
+    kernel_name = buffer_sizes['kernel_name']
+    data_type = buffer_sizes['data_type']
+    
+    header = f"""/*
+ * convIdma_buffers.h
+ *
+ *  Auto-generated buffer size definitions
+ */
+
+#ifndef {header_guard}
+#define {header_guard}
+
+// ============================================================================
+// IDMA Buffer Sizes and Tile Parameters for convVQ3D_{kernel_name}_{data_type}_MOW_WHD
+// ============================================================================
+
+// SRC tile parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_SRC_DIM1_SIZE     {buffer_sizes['SRC_DIM1_SIZE']} // input width
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_SRC_DIM1_PITCH    {buffer_sizes['SRC_DIM1_PITCH']} //
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_SRC_DIM2_SIZE     {buffer_sizes['SRC_DIM2_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_SRC_DIM2_PITCH    {buffer_sizes['SRC_DIM2_PITCH']} // {buffer_sizes['SRC_DIM1_SIZE']}*{buffer_sizes['SRC_DIM2_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_SRC_DIM3_SIZE     {buffer_sizes['SRC_DIM3_SIZE']}
+
+// DST   tile parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_DST_DIM1_SIZE     {buffer_sizes['DST_DIM1_SIZE']} // input width
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_DST_DIM1_PITCH    {buffer_sizes['DST_DIM1_PITCH']} //
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_DST_DIM2_SIZE     {buffer_sizes['DST_DIM2_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_DST_DIM2_PITCH    {buffer_sizes['DST_DIM2_PITCH']} // {buffer_sizes['DST_DIM1_SIZE']}*{buffer_sizes['DST_DIM2_SIZE']}
+
+
+// Input tile parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM1_SIZE     {buffer_sizes['IN_DIM1_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM1_PITCH    {buffer_sizes['IN_DIM1_PITCH']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM2_SIZE     {buffer_sizes['IN_DIM2_SIZE']} // IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_SIZE + ((IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM2_SIZE-1)* stride)
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM2_PITCH    {buffer_sizes['IN_DIM2_PITCH']}
+
+
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN_DATA_OFFSET   {buffer_sizes['IN_DATA_OFFSET']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN_ROWS_FIRSTDMA {buffer_sizes['IN_ROWS_FIRSTDMA']}  // IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM2_SIZE - padding rows 
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN_FRAME_PTR 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN_STATUS_FLAGS 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN_DIM1_COORD 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN_DIM3_COORD 0
+
+// Output tile parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM1_SIZE     {buffer_sizes['OUT_DIM1_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM1_PITCH    {buffer_sizes['OUT_DIM1_PITCH']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM2_SIZE     {buffer_sizes['OUT_DIM2_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM2_PITCH    {buffer_sizes['OUT_DIM2_PITCH']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM3_SIZE     {buffer_sizes['N_TILE_SIZE']} //IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILE_SIZE
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_FRAME_PTR 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_STATUS_FLAGS 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM1_COORD 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM1_EDGE1 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM1_EDGE2 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM2_EDGE1 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM2_EDGE2 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM3_COORD 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM3_EDGE1 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM3_EDGE2 0
+
+//coefficient tile parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM1_SIZE     {buffer_sizes['COEFF_DIM1_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_SIZE     {buffer_sizes['COEFF_DIM2_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM3_SIZE     {buffer_sizes['COEFF_DIM3_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM4_SIZE     {buffer_sizes['COEFF_DIM4_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM1_PITCH    {buffer_sizes['COEFF_DIM1_PITCH']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_PITCH    {buffer_sizes['COEFF_DIM2_PITCH']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM3_PITCH    {buffer_sizes['COEFF_DIM3_PITCH']}
+
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_FRAME_PTR 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_STATUS_FLAGS 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM1_COORD 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM1_EDGE1 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM1_EDGE2 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_COORD 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_EDGE1 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_EDGE2 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM3_COORD 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM3_EDGE1 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM3_EDGE2 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM4_COORD 0
+
+//bias array parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_BIAS_DIM1_SIZE       {buffer_sizes['BIAS_DIM1_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_BIAS_DIM2_SIZE       {buffer_sizes['BIAS_DIM2_SIZE']}       
+
+//output scale array parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUTSCALE_DIM1_SIZE     {buffer_sizes['OUTSCALE_DIM1_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUTSCALE_DIM2_SIZE     {buffer_sizes['OUTSCALE_DIM2_SIZE']}
+
+// Buffer sizes
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN       {buffer_sizes['IN']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF    {buffer_sizes['COEFF']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT      {buffer_sizes['OUT']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_BIAS     {buffer_sizes['BIAS']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUTSCALE {buffer_sizes['OUTSCALE']}
+
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILES {buffer_sizes['N_TILES']}           // round_toward positive(IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_SRC_DIM3_SIZE / IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILE_SIZE)
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_HIGHT_TILES {buffer_sizes['HIGHT_TILES']}     //IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_DST_DIM2_SIZE / IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM2_SIZE   
+#define  IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILE_SIZE {buffer_sizes['N_TILE_SIZE']}    // take this as input aas of now (contstant 22 for 3x3 conv and constant 64 for 7x7 conv)
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILE_SIZE_LAST {buffer_sizes['N_TILE_SIZE_LAST']} //IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM4_SIZE - IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILE_SIZE
+
+#endif /* {header_guard} */
+"""
+    return header
+
+
+def align_to_64(size):
+    """Round up size to next 64-byte boundary for alignment."""
+    return ((size + 63) // 64) * 64
+
+
+def calculate_buffer_placement(buffer_sizes, dram0_size=32*1024, dram1_size=32*1024):
+    """
+    Calculate optimal buffer placement in DRAM0 and DRAM1 for ping-pong architecture.
+    
+    Strategy:
+    1. Try default placement: input/coeff in DRAM0, output/bias/outscale in DRAM1
+    2. If DRAM0 overflows, move coefficient to DRAM1 (if it fits)
+    3. If DRAM1 overflows, move bias/outscale to DRAM0 (if it fits)
+    4. Report best fit or overflow scenario
+    
+    Note: All buffer sizes are aligned to 64 bytes to account for alignment overhead.
+    
+    Args:
+        buffer_sizes: Dictionary from calculate_buffer_sizes()
+        dram0_size: Size of DRAM0 in bytes (default 32KB)
+        dram1_size: Size of DRAM1 in bytes (default: use global DRAM_SIZE_1)
+    
+    Returns:
+        Dictionary with buffer placement information
+    """
+    # Use global DRAM sizes if not specified
+    if dram0_size is None:
+        dram0_size = DRAM_SIZE_0
+    if dram1_size is None:
+        dram1_size = DRAM_SIZE_1
+    
+    # Ping-pong buffers require 2x allocation
+    # Align each buffer to 64 bytes to account for alignment overhead
+    input_ping = align_to_64(buffer_sizes['IN'])
+    input_pong = align_to_64(buffer_sizes['IN'])
+    coeff = align_to_64(buffer_sizes['COEFF'])
+    output_ping = align_to_64(buffer_sizes['OUT'])
+    output_pong = align_to_64(buffer_sizes['OUT'])
+    bias = align_to_64(buffer_sizes['BIAS'])
+    outscale = align_to_64(buffer_sizes['OUTSCALE'])
+    
+    # CRITICAL: Check if any single buffer exceeds DRAM bank size
+    # A single buffer cannot be split across banks, so each must fit individually
+    max_bank_size = max(dram0_size, dram1_size)
+    if coeff > max_bank_size:
+        # Coefficient buffer too large - cannot fit in any single DRAM bank
+        return {
+            'strategy': 'FAIL_COEFF_TOO_LARGE',
+            'dram0_allocation': [],
+            'dram1_allocation': [],
+            'dram0_used': 0,
+            'dram1_used': 0,
+            'dram0_size': dram0_size,
+            'dram1_size': dram1_size,
+            'dram0_free': dram0_size,
+            'dram1_free': dram1_size,
+            'dram0_fits': False,
+            'dram1_fits': False,
+            'total_fits': False,
+            'error': f'Coefficient buffer ({coeff} bytes) exceeds max DRAM bank size ({max_bank_size} bytes)'
+        }
+    if input_ping > max_bank_size:
+        return {
+            'strategy': 'FAIL_INPUT_TOO_LARGE',
+            'dram0_allocation': [],
+            'dram1_allocation': [],
+            'dram0_used': 0,
+            'dram1_used': 0,
+            'dram0_size': dram0_size,
+            'dram1_size': dram1_size,
+            'dram0_free': dram0_size,
+            'dram1_free': dram1_size,
+            'dram0_fits': False,
+            'dram1_fits': False,
+            'total_fits': False,
+            'error': f'Input buffer ({input_ping} bytes) exceeds max DRAM bank size ({max_bank_size} bytes)'
+        }
+    if output_ping > max_bank_size:
+        return {
+            'strategy': 'FAIL_OUTPUT_TOO_LARGE',
+            'dram0_allocation': [],
+            'dram1_allocation': [],
+            'dram0_used': 0,
+            'dram1_used': 0,
+            'dram0_size': dram0_size,
+            'dram1_size': dram1_size,
+            'dram0_free': dram0_size,
+            'dram1_free': dram1_size,
+            'dram0_fits': False,
+            'dram1_fits': False,
+            'total_fits': False,
+            'error': f'Output buffer ({output_ping} bytes) exceeds max DRAM bank size ({max_bank_size} bytes)'
+        }
+    
+    # Strategy 1: Default placement - input/coeff in DRAM0, output/bias/outscale in DRAM1
+    strategy = "default"
+    dram0_allocation = [
+        ('input_ping', input_ping),
+        ('input_pong', input_pong),
+        ('coeff', coeff)
+    ]
+    dram1_allocation = [
+        ('output_ping', output_ping),
+        ('output_pong', output_pong),
+        ('bias', bias),
+        ('outscale', outscale)
+    ]
+    
+    dram0_used = sum(size for _, size in dram0_allocation)
+    dram1_used = sum(size for _, size in dram1_allocation)
+    dram0_fits = dram0_used <= dram0_size
+    dram1_fits = dram1_used <= dram1_size
+    
+    # Strategy 2: If DRAM0 overflows, try moving coefficient to DRAM1
+    if not dram0_fits and (dram1_used + coeff <= dram1_size):
+        strategy = "coeff_to_dram1"
+        dram0_allocation = [
+            ('input_ping', input_ping),
+            ('input_pong', input_pong)
+        ]
+        dram1_allocation = [
+            ('coeff', coeff),
+            ('output_ping', output_ping),
+            ('output_pong', output_pong),
+            ('bias', bias),
+            ('outscale', outscale)
+        ]
+        dram0_used = sum(size for _, size in dram0_allocation)
+        dram1_used = sum(size for _, size in dram1_allocation)
+        dram0_fits = dram0_used <= dram0_size
+        dram1_fits = dram1_used <= dram1_size
+    
+    # Strategy 3: If DRAM1 overflows, try moving bias/outscale to DRAM0
+    elif not dram1_fits and (dram0_used + bias + outscale <= dram0_size):
+        strategy = "bias_outscale_to_dram0"
+        dram0_allocation = [
+            ('input_ping', input_ping),
+            ('input_pong', input_pong),
+            ('coeff', coeff),
+            ('bias', bias),
+            ('outscale', outscale)
+        ]
+        dram1_allocation = [
+            ('output_ping', output_ping),
+            ('output_pong', output_pong)
+        ]
+        dram0_used = sum(size for _, size in dram0_allocation)
+        dram1_used = sum(size for _, size in dram1_allocation)
+        dram0_fits = dram0_used <= dram0_size
+        dram1_fits = dram1_used <= dram1_size
+    
+    # Strategy 4: Try combined optimization - coeff+bias+outscale to DRAM1
+    if not (dram0_fits and dram1_fits):
+        temp_dram0 = [('input_ping', input_ping), ('input_pong', input_pong)]
+        temp_dram1 = [('coeff', coeff), ('output_ping', output_ping), ('output_pong', output_pong), 
+                      ('bias', bias), ('outscale', outscale)]
+        temp_dram0_used = sum(size for _, size in temp_dram0)
+        temp_dram1_used = sum(size for _, size in temp_dram1)
+        
+        if temp_dram0_used <= dram0_size and temp_dram1_used <= dram1_size:
+            strategy = "input_only_dram0"
+            dram0_allocation = temp_dram0
+            dram1_allocation = temp_dram1
+            dram0_used = temp_dram0_used
+            dram1_used = temp_dram1_used
+            dram0_fits = True
+            dram1_fits = True
+    
+    # Strategy 5: Split input ping-pong buffers across DRAMs
+    if not (dram0_fits and dram1_fits):
+        temp_dram0 = [('input_ping', input_ping), ('coeff', coeff)]
+        temp_dram1 = [('input_pong', input_pong), ('output_ping', output_ping), ('output_pong', output_pong), 
+                      ('bias', bias), ('outscale', outscale)]
+        temp_dram0_used = sum(size for _, size in temp_dram0)
+        temp_dram1_used = sum(size for _, size in temp_dram1)
+        
+        if temp_dram0_used <= dram0_size and temp_dram1_used <= dram1_size:
+            strategy = "split_input_ping_pong"
+            dram0_allocation = temp_dram0
+            dram1_allocation = temp_dram1
+            dram0_used = temp_dram0_used
+            dram1_used = temp_dram1_used
+            dram0_fits = True
+            dram1_fits = True
+    
+    # Strategy 6: Alternative split - input_pong+coeff in DRAM0
+    if not (dram0_fits and dram1_fits):
+        temp_dram0 = [('input_pong', input_pong), ('coeff', coeff)]
+        temp_dram1 = [('input_ping', input_ping), ('output_ping', output_ping), ('output_pong', output_pong), 
+                      ('bias', bias), ('outscale', outscale)]
+        temp_dram0_used = sum(size for _, size in temp_dram0)
+        temp_dram1_used = sum(size for _, size in temp_dram1)
+        
+        if temp_dram0_used <= dram0_size and temp_dram1_used <= dram1_size:
+            strategy = "split_input_ping_pong_alt"
+            dram0_allocation = temp_dram0
+            dram1_allocation = temp_dram1
+            dram0_used = temp_dram0_used
+            dram1_used = temp_dram1_used
+            dram0_fits = True
+            dram1_fits = True
+    
+    # Build individual buffer DRAM placement mapping from allocation lists
+    # Check which DRAM each buffer is allocated to
+    dram0_buffers = {name for name, _ in dram0_allocation}
+    
+    # Map buffer names to their DRAM placement (0 or 1)
+    # Default to DRAM1 if not in DRAM0
+    in1_dram = 0 if 'input_ping' in dram0_buffers else 1
+    in2_dram = 0 if 'input_pong' in dram0_buffers else 1
+    coeff_dram = 0 if 'coeff' in dram0_buffers else 1
+    out1_dram = 0 if 'output_ping' in dram0_buffers else 1
+    out2_dram = 0 if 'output_pong' in dram0_buffers else 1
+    bias_dram = 0 if 'bias' in dram0_buffers else 1
+    outscale_dram = 0 if 'outscale' in dram0_buffers else 1
+    
+    return {
+        'strategy': strategy,
+        'dram0_allocation': dram0_allocation,
+        'dram1_allocation': dram1_allocation,
+        'dram0_used': dram0_used,
+        'dram1_used': dram1_used,
+        'dram0_size': dram0_size,
+        'dram1_size': dram1_size,
+        'dram0_free': dram0_size - dram0_used,
+        'dram1_free': dram1_size - dram1_used,
+        'dram0_fits': dram0_fits,
+        'dram1_fits': dram1_fits,
+        'total_fits': dram0_fits and dram1_fits,
+        # Individual buffer DRAM placement for generate_layer_configs.py
+        'IN1_dram': in1_dram,
+        'IN2_dram': in2_dram,
+        'COEFF_dram': coeff_dram,
+        'OUT1_dram': out1_dram,
+        'OUT2_dram': out2_dram,
+        'BIAS_dram': bias_dram,
+        'OUTSCALE_dram': outscale_dram,
+    }
+
+
+def print_buffer_placement(placement):
+    """Print buffer placement information."""
+    strategy_names = {
+        'default': 'Default: Input+Coeff->DRAM0, Output+Bias+Outscale->DRAM1',
+        'coeff_to_dram1': 'Optimized: Coefficient moved to DRAM1',
+        'bias_outscale_to_dram0': 'Optimized: Bias+Outscale moved to DRAM0',
+        'input_only_dram0': 'Optimized: Only Input ping-pong in DRAM0',
+        'split_input_ping_pong': 'Optimized: Input buffers split across DRAMs (ping in DRAM0, pong in DRAM1)',
+        'split_input_ping_pong_alt': 'Optimized: Input buffers split across DRAMs (pong in DRAM0, ping in DRAM1)'
+    }
+    
+    print("\n=== Buffer Placement ===")
+    print(f"Strategy: {strategy_names.get(placement['strategy'], placement['strategy'])}")
+    print(f"DRAM0 Size: {placement['dram0_size']:6d} bytes ({placement['dram0_size']//1024}KB)")
+    print(f"DRAM1 Size: {placement['dram1_size']:6d} bytes ({placement['dram1_size']//1024}KB)")
+    
+    print("\nDRAM0 Allocation:")
+    for name, size in placement['dram0_allocation']:
+        print(f"  {name:20s} -> {size:6d} bytes -> DRAM0")
+    print(f"  {'Total Used':20s}    {placement['dram0_used']:6d} bytes")
+    print(f"  {'Free':20s}    {placement['dram0_free']:6d} bytes")
+    print(f"  Status: {'OK FITS' if placement['dram0_fits'] else 'X OVERFLOW'}")
+    
+    print("\nDRAM1 Allocation:")
+    for name, size in placement['dram1_allocation']:
+        print(f"  {name:20s} -> {size:6d} bytes -> DRAM1")
+    print(f"  {'Total Used':20s}    {placement['dram1_used']:6d} bytes")
+    print(f"  {'Free':20s}    {placement['dram1_free']:6d} bytes")
+    print(f"  Status: {'OK FITS' if placement['dram1_fits'] else 'X OVERFLOW'}")
+    
+    print(f"\nOverall: {'OK ALL BUFFERS FIT' if placement['total_fits'] else 'X INSUFFICIENT MEMORY'}")
+
+
+def print_buffer_info(buffer_sizes):
+    """Print detailed buffer information."""
+    print("\n=== Buffer Size Calculations ===")
+    print(f"Kernel: {buffer_sizes['kernel_name']}")
+    print(f"Data Type: {buffer_sizes['data_type']}")
+    print(f"\nBuffer Sizes:")
+    print(f"  INPUT:      {buffer_sizes['IN']:6d} bytes")
+    print(f"  COEFF:      {buffer_sizes['COEFF']:6d} bytes")
+    print(f"  OUTPUT:     {buffer_sizes['OUT']:6d} bytes")
+    print(f"  BIAS:       {buffer_sizes['BIAS']:6d} bytes")
+    print(f"  OUTSCALE:   {buffer_sizes['OUTSCALE']:6d} bytes")
+    print(f"\nTotal Memory: {sum([buffer_sizes['IN'], buffer_sizes['COEFF'], buffer_sizes['OUT'], buffer_sizes['BIAS'], buffer_sizes['OUTSCALE']]):6d} bytes")
+    
+    details = buffer_sizes['details']
+    print(f"\nTile Parameters:")
+    print(f"  SRC_DIM1_SIZE:              {buffer_sizes['SRC_DIM1_SIZE']}")
+    print(f"  SRC_DIM1_PITCH:             {buffer_sizes['SRC_DIM1_PITCH']}")
+    print(f"  SRC_DIM2_PITCH:             {buffer_sizes['SRC_DIM2_PITCH']}")
+    print(f"  DST_DIM1_SIZE:              {buffer_sizes['DST_DIM1_SIZE']}")
+    print(f"  DST_DIM1_PITCH:             {buffer_sizes['DST_DIM1_PITCH']}")
+    print(f"  DST_DIM2_PITCH:             {buffer_sizes['DST_DIM2_PITCH']}")
+    print(f"  DIM1_SIZE (input width):    {buffer_sizes['IN_DIM1_SIZE']}")
+    print(f"  DIM1_PITCH (with padding):  {buffer_sizes['IN_DIM1_PITCH']}")
+    print(f"  DIM2_PITCH:                 {buffer_sizes['IN_DIM2_PITCH']}")
+    print(f"  IN_DATA_OFFSET:             {buffer_sizes['IN_DATA_OFFSET']}")
+    print(f"  OUT_DIM1_SIZE:              {buffer_sizes['OUT_DIM1_SIZE']}")
+    print(f"  OUT_DIM1_PITCH:             {buffer_sizes['OUT_DIM1_PITCH']}")
+    print(f"  OUT_DIM2_SIZE:              {buffer_sizes['OUT_DIM2_SIZE']}")
+    print(f"  OUT_DIM2_PITCH:             {buffer_sizes['OUT_DIM2_PITCH']}")
+    print(f"  OUT_DIM3_SIZE:              {buffer_sizes['OUT_DIM3_SIZE']}")
+    print(f"\nDetails:")
+    print(f"  Input buffer WHD (with padding): {details['input_buff_whd']}")
+    print(f"  Input rows needed for 2 output rows: {details['input_rows_needed']}")
+    print(f"  Output buffer WHD: {details['output_buff_whd']}")
+
+
+def calculate_conv_params(n, c, h, w, oc, wc, wh, ww, oh, ow, 
+                         stride_h, stride_w, padding_h, padding_w, 
+                         dilation_h, dilation_w, groups, 
+                         in_zero_point, weight_zero_point, 
+                         bias_scale, output_scale, output_zero_point):
+    """
+    Calculate convolution parameters including output_shift and output_scale.
+    
+    Args:
+        n, c, h, w: Input batch, channels, height, width
+        oc, wc, wh, ww: Output channels, weight channels, weight height, weight width
+        oh, ow: Output height, output width
+        stride_h, stride_w: Stride values
+        padding_h, padding_w: Padding values
+        dilation_h, dilation_w: Dilation values
+        groups: Number of groups for grouped convolution
+        in_zero_point: Input zero point
+        weight_zero_point: Weight zero point
+        bias_scale: Bias scale value
+        output_scale: Output scale value
+        output_zero_point: Output zero point
+    
+    Returns:
+        dict: Dictionary containing calculated conv_params
+    """
+    # Calculate effective scale
+    effective_scale = bias_scale / output_scale if output_scale != 0 else 0
+    
+    # Find the best output_shift so that outputScale fits in uint16_t
+    best_shift = 15
+    raw_scale = int(effective_scale * (1 << best_shift))
+    
+    if raw_scale > 65535:
+        # Scale too large for uint16_t, reduce shift until it fits
+        while best_shift > 0 and raw_scale > 65535:
+            best_shift -= 1
+            raw_scale = int(effective_scale * (1 << best_shift))
+    elif raw_scale < 16384 and best_shift < 31:
+        # Scale too small, increase shift for better precision
+        while best_shift < 31:
+            trial = int(effective_scale * (1 << (best_shift + 1)))
+            if trial > 65535:
+                break
+            best_shift += 1
+            raw_scale = trial
+    
+    # Clamp to valid uint16_t range [1, 65535]
+    if raw_scale <= 0:
+        raw_scale = 1
+    if raw_scale > 65535:
+        raw_scale = 65535
+    
+    return {
+        'strideX': stride_w,
+        'strideY': stride_h,
+        'accumShift': 0,  # No pre-shift; keep full int32 accumulator precision
+        'reluMax': 127,  # Max value for int8_t output
+        'outputShift': best_shift,
+        'outputScale': raw_scale,
+        'dilation': max(dilation_h, dilation_w),
+        'kernelHeight': wh,
+        'kernelWidth': ww
+    }
+
+
+def main():
+    """Main function with example usage."""
+    # Configuration for 7x7j2d1 convolution
+    # Input: n=1, c=3, h=224, w=224
+    # Output: oc=64, wc=3, wh=7, ww=7, oh=112, ow=112
+    conv_params_7x7 = calculate_conv_params(
+        n=1, c=3, h=224, w=224,
+        oc=64, wc=3, wh=7, ww=7,
+        oh=112, ow=112,
+        stride_h=2, stride_w=2,
+        padding_h=3, padding_w=3,
+        dilation_h=1, dilation_w=1,
+        groups=1,
+        in_zero_point=0,
+        weight_zero_point=0,
+        bias_scale=1.0,
+        output_scale=1.0,
+        output_zero_point=0
+    )
+    
+    config_7x7 = {
+        'input_whd': (224, 224, 3),
+        'output_whd': (112, 112, 64),
+        'kernel_whdn': (7, 7, 3, 64),
+        'padding': (3, 3, 3, 3, 0, 0),
+        'stride_xy': (2, 2),
+        'kernel_name': "7x7j2d1",
+        'data_type': "S8S8",
+        'conv_params': (
+            conv_params_7x7['strideX'],
+            conv_params_7x7['strideY'],
+            conv_params_7x7['accumShift'],
+            conv_params_7x7['reluMax'],
+            conv_params_7x7['outputShift'],
+            conv_params_7x7['outputScale'],
+            conv_params_7x7['dilation'],
+            conv_params_7x7['kernelHeight'],
+            conv_params_7x7['kernelWidth']
+        ),
+        'conv_flags': 0,
+    }
+    
+    # Configuration for 3x3j1d1 convolution
+    # Input: n=1, c=64, h=56, w=56
+    # Output: oc=64, wc=64, wh=3, ww=3, oh=56, ow=56
+    conv_params_3x3 = calculate_conv_params(
+        n=1, c=64, h=56, w=56,
+        oc=64, wc=64, wh=3, ww=3,
+        oh=56, ow=56,
+        stride_h=1, stride_w=1,
+        padding_h=1, padding_w=1,
+        dilation_h=1, dilation_w=1,
+        groups=1,
+        in_zero_point=0,
+        weight_zero_point=0,
+        bias_scale=1.0,
+        output_scale=1.0,
+        output_zero_point=0
+    )
+    
+    config_3x3 = {
+        'input_whd': (56, 56, 64),
+        'output_whd': (56, 56, 64),
+        'kernel_whdn': (3, 3, 64, 64),
+        'padding': (1, 1, 1, 1, 0, 0),
+        'stride_xy': (1, 1),
+        'kernel_name': "3x3j1d1",
+        'data_type': "S8S8",
+        'conv_params': (
+            conv_params_3x3['strideX'],
+            conv_params_3x3['strideY'],
+            conv_params_3x3['accumShift'],
+            conv_params_3x3['reluMax'],
+            conv_params_3x3['outputShift'],
+            conv_params_3x3['outputScale'],
+            conv_params_3x3['dilation'],
+            conv_params_3x3['kernelHeight'],
+            conv_params_3x3['kernelWidth']
+        ),
+        'conv_flags': 0,
+    }
+    
+    # Configuration for 3x3j2d1 convolution
+    # Input: n=1, c=64, h=56, w=56
+    # Output: oc=128, wc=64, wh=3, ww=3, oh=28, ow=28
+    conv_params_3x3j2d1 = calculate_conv_params(
+        n=1, c=64, h=56, w=56,
+        oc=128, wc=64, wh=3, ww=3,
+        oh=28, ow=28,
+        stride_h=2, stride_w=2,
+        padding_h=1, padding_w=1,
+        dilation_h=1, dilation_w=1,
+        groups=1,
+        in_zero_point=0,
+        weight_zero_point=0,
+        bias_scale=1.0,
+        output_scale=1.0,
+        output_zero_point=0
+    )
+    
+    config_3x3j2d1 = {
+        'input_whd': (56, 56, 64),
+        'output_whd': (28, 28, 128),
+        'kernel_whdn': (3, 3, 64, 128),
+        'padding': (1, 1, 1, 1, 0, 0),
+        'stride_xy': (2, 2),
+        'kernel_name': "3x3j2d1",
+        'data_type': "S8S8",
+        'conv_params': (
+            conv_params_3x3j2d1['strideX'],
+            conv_params_3x3j2d1['strideY'],
+            conv_params_3x3j2d1['accumShift'],
+            conv_params_3x3j2d1['reluMax'],
+            conv_params_3x3j2d1['outputShift'],
+            conv_params_3x3j2d1['outputScale'],
+            conv_params_3x3j2d1['dilation'],
+            conv_params_3x3j2d1['kernelHeight'],
+            conv_params_3x3j2d1['kernelWidth']
+        ),
+        'conv_flags': 0,
+    }
+    
+    # Configuration for 1x1j2d1 convolution
+    # Input: n=1, c=64, h=56, w=56
+    # Output: oc=128, wc=64, wh=1, ww=1, oh=28, ow=28
+    conv_params_1x1j2d1 = calculate_conv_params(
+        n=1, c=64, h=56, w=56,
+        oc=128, wc=64, wh=1, ww=1,
+        oh=28, ow=28,
+        stride_h=2, stride_w=2,
+        padding_h=0, padding_w=0,
+        dilation_h=1, dilation_w=1,
+        groups=1,
+        in_zero_point=0,
+        weight_zero_point=0,
+        bias_scale=1.0,
+        output_scale=1.0,
+        output_zero_point=0
+    )
+    
+    config_1x1j2d1 = {
+        'input_whd': (56, 56, 64),
+        'output_whd': (28, 28, 128),
+        'kernel_whdn': (1, 1, 64, 128),
+        'padding': (0, 0, 0, 0, 0, 0),
+        'stride_xy': (2, 2),
+        'kernel_name': "1x1j2d1",
+        'data_type': "S8S8",
+        'conv_params': (
+            conv_params_1x1j2d1['strideX'],
+            conv_params_1x1j2d1['strideY'],
+            conv_params_1x1j2d1['accumShift'],
+            conv_params_1x1j2d1['reluMax'],
+            conv_params_1x1j2d1['outputShift'],
+            conv_params_1x1j2d1['outputScale'],
+            conv_params_1x1j2d1['dilation'],
+            conv_params_1x1j2d1['kernelHeight'],
+            conv_params_1x1j2d1['kernelWidth']
+        ),
+        'conv_flags': 0,
+    }
+    
+    # Configuration for 1x1j1d1 convolution
+    # Input: n=1, c=512, h=28, w=28
+    # Output: oc=256, wc=512, wh=1, ww=1, oh=28, ow=28
+    conv_params_1x1j1d1 = calculate_conv_params(
+        n=1, c=512, h=28, w=28,
+        oc=256, wc=512, wh=1, ww=1,
+        oh=28, ow=28,
+        stride_h=1, stride_w=1,
+        padding_h=0, padding_w=0,
+        dilation_h=1, dilation_w=1,
+        groups=1,
+        in_zero_point=0,
+        weight_zero_point=0,
+        bias_scale=1.0,
+        output_scale=1.0,
+        output_zero_point=0
+    )
+    
+    config_1x1j1d1 = {
+        'input_whd': (28, 28, 512),
+        'output_whd': (28, 28, 256),
+        'kernel_whdn': (1, 1, 512, 256),
+        'padding': (0, 0, 0, 0, 0, 0),
+        'stride_xy': (1, 1),
+        'kernel_name': "1x1j1d1",
+        'data_type': "S8S8",
+        'conv_params': (
+            conv_params_1x1j1d1['strideX'],
+            conv_params_1x1j1d1['strideY'],
+            conv_params_1x1j1d1['accumShift'],
+            conv_params_1x1j1d1['reluMax'],
+            conv_params_1x1j1d1['outputShift'],
+            conv_params_1x1j1d1['outputScale'],
+            conv_params_1x1j1d1['dilation'],
+            conv_params_1x1j1d1['kernelHeight'],
+            conv_params_1x1j1d1['kernelWidth']
+        ),
+        'conv_flags': 0,
+    }    
+    
+    
+    # Find maximum configuration for 7x7j2d1
+    best_n_tile_7x7, best_out_rows_7x7, buffer_sizes_7x7 = find_max_tile_config(**config_7x7)
+    print_buffer_info(buffer_sizes_7x7)
+    
+    # Calculate and print buffer placement for 7x7
+    placement_7x7 = calculate_buffer_placement(buffer_sizes_7x7, dram0_size=DRAM_SIZE_0, dram1_size=DRAM_SIZE_1)
+    print_buffer_placement(placement_7x7)
+    
+    # Find maximum configuration for 3x3j1d1
+    print("\n" + "="*60 + "\n")
+    best_n_tile_3x3, best_out_rows_3x3, buffer_sizes_3x3 = find_max_tile_config(**config_3x3)
+    print_buffer_info(buffer_sizes_3x3)
+    
+    # Calculate and print buffer placement for 3x3
+    placement_3x3 = calculate_buffer_placement(buffer_sizes_3x3, dram0_size=DRAM_SIZE_0, dram1_size=DRAM_SIZE_1)
+    print_buffer_placement(placement_3x3)
+    
+    # Find maximum configuration for 3x3j2d1
+    print("\n" + "="*60 + "\n")
+    best_n_tile_3x3j2d1, best_out_rows_3x3j2d1, buffer_sizes_3x3j2d1 = find_max_tile_config(**config_3x3j2d1)
+    print_buffer_info(buffer_sizes_3x3j2d1)
+    
+    # Calculate and print buffer placement for 3x3j2d1
+    placement_3x3j2d1 = calculate_buffer_placement(buffer_sizes_3x3j2d1, dram0_size=DRAM_SIZE_0, dram1_size=DRAM_SIZE_1)
+    print_buffer_placement(placement_3x3j2d1)
+    
+    # Find maximum configuration for 1x1j2d1
+    print("\n" + "="*60 + "\n")
+    best_n_tile_1x1j2d1, best_out_rows_1x1j2d1, buffer_sizes_1x1j2d1 = find_max_tile_config(**config_1x1j2d1)
+    print_buffer_info(buffer_sizes_1x1j2d1)
+    
+    # Calculate and print buffer placement for 1x1j2d1
+    placement_1x1j2d1 = calculate_buffer_placement(buffer_sizes_1x1j2d1, dram0_size=DRAM_SIZE_0, dram1_size=DRAM_SIZE_1)
+    print_buffer_placement(placement_1x1j2d1)
+    
+    # Find maximum configuration for 1x1j1d1
+    print("\n" + "="*60 + "\n")
+    best_n_tile_1x1j1d1, best_out_rows_1x1j1d1, buffer_sizes_1x1j1d1 = find_max_tile_config(**config_1x1j1d1)
+    print_buffer_info(buffer_sizes_1x1j1d1)
+    
+    # Calculate and print buffer placement for 1x1j1d1
+    placement_1x1j1d1 = calculate_buffer_placement(buffer_sizes_1x1j1d1, dram0_size=DRAM_SIZE_0, dram1_size=DRAM_SIZE_1)
+    print_buffer_placement(placement_1x1j1d1)
+    
+    # Generate combined header content
+    header_content = generate_combined_header([buffer_sizes_7x7, buffer_sizes_3x3, buffer_sizes_3x3j2d1, buffer_sizes_1x1j2d1, buffer_sizes_1x1j1d1])
+    print("\n=== Generated Header Content ===")
+    print(header_content)
+    
+    # Write to file
+    output_file = r"C:\usr\xtensa\Xplorer-11.1.5-workspaces\xicnn1\test_cnn_depthwise_convolve_MOD2\test\convIdma_buffers.h"
+    with open(output_file, 'w') as f:
+        f.write(header_content)
+    print(f"\nHeader file written to: {output_file}")
+
+
+def generate_combined_header(buffer_sizes_list, header_guard="CONVIDMA_BUFFERS_H_", dram0_size=None, dram1_size=None):
+    """
+    Build the text of a C header holding IDMA buffer/tile macros for several kernels.
+
+    Args:
+        buffer_sizes_list: List of per-kernel buffer-size dictionaries (see calculate_buffer_sizes());
+            each must provide 'kernel_name', 'data_type' and the size/pitch keys read below.
+        header_guard: Include-guard macro name.
+        dram0_size / dram1_size: DRAM sizes in bytes; None falls back to DRAM_SIZE_0 / DRAM_SIZE_1.
+
+    Returns:
+        The complete header as a single string (nothing is written to disk here).
+    """
+    # Use global DRAM sizes if not specified
+    if dram0_size is None:
+        dram0_size = DRAM_SIZE_0
+    if dram1_size is None:
+        dram1_size = DRAM_SIZE_1
+
+    header = f"""/*
+ * convIdma_buffers.h
+ *
+ *  Auto-generated buffer size definitions
+ */
+
+#ifndef {header_guard}
+#define {header_guard}
+
+
+// ============================================================================
+// Available DRAM Sizes for IDMA Buffers
+// ============================================================================
+
+#define IDMA_BUFFER_SIZE_DRAM0 ({dram0_size}) // {dram0_size // 1024} KB for DRAM0
+#define IDMA_BUFFER_SIZE_DRAM1 ({dram1_size}) // {dram1_size // 1024} KB for DRAM1
+
+"""
+
+    # Calculate placements for all kernels
+    placements = [calculate_buffer_placement(bs, dram0_size=dram0_size, dram1_size=dram1_size)
+                  for bs in buffer_sizes_list]
+
+    # Generate content for each kernel configuration
+    for buffer_sizes, placement in zip(buffer_sizes_list, placements):
+        kernel_name = buffer_sizes['kernel_name']
+        data_type = buffer_sizes['data_type']
+
+        # Extract padding values (all zero when the dictionary carries no 'padding' entry)
+        dim1_edge1, dim1_edge2, dim2_edge1, dim2_edge2, dim3_edge1, dim3_edge2 = (
+            buffer_sizes.get('padding', (0, 0, 0, 0, 0, 0))
+        )
+
+        header += f"""// ============================================================================
+// IDMA Buffer Sizes and Tile Parameters for convVQ3D_{kernel_name}_{data_type}_MOW_WHD
+// ============================================================================
+
+// SRC tile parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_SRC_DIM1_SIZE     {buffer_sizes['SRC_DIM1_SIZE']} // input width
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_SRC_DIM1_PITCH    {buffer_sizes['SRC_DIM1_PITCH']} //
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_SRC_DIM2_SIZE     {buffer_sizes['SRC_DIM2_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_SRC_DIM2_PITCH    {buffer_sizes['SRC_DIM2_PITCH']} // {buffer_sizes['SRC_DIM1_SIZE']}*{buffer_sizes['SRC_DIM2_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_SRC_DIM3_SIZE     {buffer_sizes['SRC_DIM3_SIZE']}
+
+// DST   tile parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_DST_DIM1_SIZE     {buffer_sizes['DST_DIM1_SIZE']} // output width
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_DST_DIM1_PITCH    {buffer_sizes['DST_DIM1_PITCH']} //
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_DST_DIM2_SIZE     {buffer_sizes['DST_DIM2_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_DST_DIM2_PITCH    {buffer_sizes['DST_DIM2_PITCH']} // {buffer_sizes['DST_DIM1_SIZE']}*{buffer_sizes['DST_DIM2_SIZE']}
+
+
+// Input tile parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM1_SIZE     {buffer_sizes['IN_DIM1_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM1_PITCH    {buffer_sizes['IN_DIM1_PITCH']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM2_SIZE     {buffer_sizes['IN_DIM2_SIZE']} // IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_SIZE + ((IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM2_SIZE-1)* stride)
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM2_PITCH    {buffer_sizes['IN_DIM2_PITCH']}
+
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM1_EDGE1 {dim1_edge1}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM1_EDGE2 {dim1_edge2}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM2_EDGE1 {dim2_edge1}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM2_EDGE2 {dim2_edge2}  
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM3_EDGE1 {dim3_edge1}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM3_EDGE2 {dim3_edge2}
+
+
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN_DATA_OFFSET   {buffer_sizes['IN_DATA_OFFSET']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN_ROWS_FIRSTDMA {buffer_sizes['IN_ROWS_FIRSTDMA']}  // IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_IN_DIM2_SIZE - padding rows 
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN_FRAME_PTR 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN_STATUS_FLAGS 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN_DIM1_COORD 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN_DIM3_COORD 0
+
+// Output tile parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM1_SIZE     {buffer_sizes['OUT_DIM1_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM1_PITCH    {buffer_sizes['OUT_DIM1_PITCH']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM2_SIZE     {buffer_sizes['OUT_DIM2_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM2_PITCH    {buffer_sizes['OUT_DIM2_PITCH']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM3_SIZE     {buffer_sizes['N_TILE_SIZE']} //IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILE_SIZE
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_FRAME_PTR 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_STATUS_FLAGS 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM1_COORD 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM1_EDGE1 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM1_EDGE2 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM2_EDGE1 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM2_EDGE2 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM3_COORD 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM3_EDGE1 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM3_EDGE2 0
+
+//coefficient tile parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM1_SIZE     {buffer_sizes['COEFF_DIM1_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_SIZE     {buffer_sizes['COEFF_DIM2_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM3_SIZE     {buffer_sizes['COEFF_DIM3_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM4_SIZE     {buffer_sizes['COEFF_DIM4_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM1_PITCH    {buffer_sizes['COEFF_DIM1_PITCH']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_PITCH    {buffer_sizes['COEFF_DIM2_PITCH']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM3_PITCH    {buffer_sizes['COEFF_DIM3_PITCH']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_FRAME_PTR 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_STATUS_FLAGS 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM1_COORD 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM1_EDGE1 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM1_EDGE2 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_COORD 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_EDGE1 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_EDGE2 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM3_COORD 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM3_EDGE1 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM3_EDGE2 0
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM4_COORD 0
+
+//bias array parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_BIAS_DIM1_SIZE       {buffer_sizes['BIAS_DIM1_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_BIAS_DIM2_SIZE       {buffer_sizes['BIAS_DIM2_SIZE']}       
+
+//output scale array parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUTSCALE_DIM1_SIZE     {buffer_sizes['OUTSCALE_DIM1_SIZE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUTSCALE_DIM2_SIZE     {buffer_sizes['OUTSCALE_DIM2_SIZE']}
+
+// Buffer sizes
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN1       {buffer_sizes['IN']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN2       {buffer_sizes['IN']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF     {buffer_sizes['COEFF']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT1     {buffer_sizes['OUT']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT2     {buffer_sizes['OUT']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_BIAS       {buffer_sizes['BIAS']}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUTSCALE   {buffer_sizes['OUTSCALE']}
+
+"""
+
+        # Record which DRAM (0 or 1) each placed buffer landed in; the allocation size is not needed here
+        dram_map = {}
+        for name, _size in placement['dram0_allocation']:
+            dram_map[name] = 0
+        for name, _size in placement['dram1_allocation']:
+            dram_map[name] = 1
+
+        # Map buffer names to macro names; a buffer missing from the placement falls back to the literal default in .get()
+        header += f"""#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN1_DRAM     {dram_map.get('input_ping', 0)}  
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_IN2_DRAM     {dram_map.get('input_pong', 0)} 
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_COEFF_DRAM   {dram_map.get('coeff', 0)} 
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT1_DRAM     {dram_map.get('output_ping', 1)}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUT2_DRAM     {dram_map.get('output_pong', 1)}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_BIAS_DRAM     {dram_map.get('bias', 1)}
+#define IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD_OUTSCALE_DRAM {dram_map.get('outscale', 1)}
+
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILES {buffer_sizes['N_TILES']}           // round_toward positive(IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_SRC_DIM3_SIZE / IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILE_SIZE)
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_HIGHT_TILES {buffer_sizes['HIGHT_TILES']}     //IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_DST_DIM2_SIZE / IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUT_DIM2_SIZE   
+#define  IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILE_SIZE {buffer_sizes['N_TILE_SIZE']}    // take this as input as of now (constant 22 for 3x3 conv and constant 64 for 7x7 conv)
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILE_SIZE_LAST {buffer_sizes['N_TILE_SIZE_LAST']} //IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM4_SIZE - IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILE_SIZE
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_TILE_SIZE_LAST {buffer_sizes['COEFF_TILE_SIZE_LAST']}  //  IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM1_SIZE * IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM2_SIZE * IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_COEFF_DIM3_SIZE * IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_N_TILE_SIZE_LAST
+"""
+
+        # Add convolution parameters if available ('STRIDEX' acts as the presence marker for the whole group)
+        if 'STRIDEX' in buffer_sizes:
+            header += f"""
+// Convolution parameters
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_STRIDEX {buffer_sizes['STRIDEX']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_STRIDEY {buffer_sizes['STRIDEY']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_ACCUM_SHIFT {buffer_sizes['ACCUM_SHIFT']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_RELU_MAX {buffer_sizes['RELU_MAX']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_RELU_MIN {buffer_sizes['RELU_MIN']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUTPUT_SHIFT {buffer_sizes['OUTPUT_SHIFT']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_OUTPUT_SCALE {buffer_sizes['OUTPUT_SCALE']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_DILATION {buffer_sizes['DILATION']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_KERNEL_HEIGHT {buffer_sizes['KERNEL_HEIGHT']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_KERNEL_WIDTH {buffer_sizes['KERNEL_WIDTH']}
+#define IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD_FLAGS {buffer_sizes['FLAGS']}
+"""
+
+        header += """
+
+ 
+"""
+
+    header += f"#endif /* {header_guard} */\n"
+    return header
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backends/cadence/vision/config_generator/generate_layer_configs.py b/backends/cadence/vision/config_generator/generate_layer_configs.py
new file mode 100644
index 00000000000..65459653ec5
--- /dev/null
+++ b/backends/cadence/vision/config_generator/generate_layer_configs.py
@@ -0,0 +1,1158 @@
+#!/usr/bin/env python3
+"""
+Generate buffer configuration lookup table from layer configurations
+
+This script extracts conv2d layers directly from PyTorch models (or reads
+from .csv/.json) and:
+1. Extracts unique conv2d layer parameters via forward hooks
+2. Calculates optimal buffer sizes and tiling for each layer
+3. Generates a C lookup table with all configurations
+4. Outputs conv_layer_configs.h for runtime use
+
+Usage:
+    # Direct from model (no CSV needed):
+    python generate_layer_configs.py --model resnet18 --output conv_layer_configs.h --dram0 64000 --dram1 64000
+    python generate_layer_configs.py --model resnet50 --output conv_layer_configs.h --dram0 64000 --dram1 64000
+    python generate_layer_configs.py --model resnet18+resnet50 --output conv_layer_configs.h --dram0 64000 --dram1 64000
+
+    # From existing CSV
+    python generate_layer_configs.py resnet_conv_list.csv --output conv_layer_configs.h --dram0 64000 --dram1 64000
+
+    # From .pte extraction JSON
+    python generate_layer_configs.py layers_config.json --dram0 32768 --dram1 32768
+
+    # Generate all configs in no-DMA mode (changes _dma suffix to _no_dma for every kernel name)
+    python generate_layer_configs.py resnet_conv_list.csv --output conv_layer_configs_no_dma.h --dram0 64000 --dram1 64000 --no-dma-mode
+"""
+
+import os
+import sys
+import json
+import argparse
+from pathlib import Path
+from collections import OrderedDict
+
+# Import the existing buffer calculation logic
+sys.path.insert(0, str(Path(__file__).parent))
+from generate_idma_buffers import (
+    find_max_tile_config,
+    calculate_buffer_sizes_with_rows,
+    calculate_buffer_placement,
+    DRAM_SIZE_0,
+    DRAM_SIZE_1
+)
+
+# ---------------------------------------------------------------------------
+# Direct model extraction (replaces extract_resnet_layers.py)
+# ---------------------------------------------------------------------------
+
+SUPPORTED_MODELS = ['resnet18', 'resnet50']  # same names as the keys of the `builders` table in load_layers_from_model()
+
+
+def _build_name_map_resnet18():
+    """Map ResNet-18 (BasicBlock) module paths to friendly conv names, in network order; downsample convs get the 'a' suffix."""
+    return OrderedDict([
+        ('conv1', 'conv1'),
+        ('layer1.0.conv1', 'conv2.1'),
+        ('layer1.0.conv2', 'conv2.2'),
+        ('layer1.1.conv1', 'conv3.1'),
+        ('layer1.1.conv2', 'conv3.2'),
+        ('layer2.0.downsample.0', 'conv4a.1'),
+        ('layer2.0.conv1', 'conv4b.1'),
+        ('layer2.0.conv2', 'conv4b.2'),
+        ('layer2.1.conv1', 'conv5.1'),
+        ('layer2.1.conv2', 'conv5.2'),
+        ('layer3.0.downsample.0', 'conv6a.1'),
+        ('layer3.0.conv1', 'conv6b.1'),
+        ('layer3.0.conv2', 'conv6b.2'),
+        ('layer3.1.conv1', 'conv7.1'),
+        ('layer3.1.conv2', 'conv7.2'),
+        ('layer4.0.downsample.0', 'conv8a.1'),
+        ('layer4.0.conv1', 'conv8b.1'),
+        ('layer4.0.conv2', 'conv8b.2'),
+        ('layer4.1.conv1', 'conv9.1'),
+        ('layer4.1.conv2', 'conv9.2'),
+    ])
+
+
+def _build_name_map_resnet50():
+    """Map ResNet-50 (Bottleneck) module paths to friendly conv names: three convs per block, 3/4/6/3 blocks per layer group."""
+    name_map = OrderedDict([('conv1', 'conv1')])
+    blocks_per_group = (3, 4, 6, 3)  # layer1 .. layer4
+    block_no = 2  # friendly block numbers continue after the stem conv1
+    for group, n_blocks in enumerate(blocks_per_group, start=1):
+        for blk in range(n_blocks):
+            module_prefix = f'layer{group}.{blk}'
+            friendly = f'conv{block_no}'
+            if blk == 0:
+                # First block of a group: the downsample conv is the 'a' branch,
+                # the three main-path convs form the 'b' branch.
+                name_map[f'{module_prefix}.downsample.0'] = f'{friendly}a.1'
+                branch = 'b'
+            else:
+                branch = ''
+            # Main-path convs are inserted in module order conv1, conv2, conv3.
+            for conv_idx in (1, 2, 3):
+                target = f'{friendly}{branch}.{conv_idx}'
+                name_map[f'{module_prefix}.conv{conv_idx}'] = target
+            block_no += 1
+    return name_map
+
+
+def _get_conv_layers_via_hooks(model, name_map, input_size=(1, 3, 64, 64)):
+    """
+    Capture input/output shapes and conv hyper-parameters of every mapped Conv2d/ConvTranspose2d via forward hooks (one dummy forward pass).
+    Returns an OrderedDict keyed by the friendly name from name_map; hooks are removed even if the forward pass raises.
+    """
+    import torch
+    import torch.nn as nn
+
+    layers_info = OrderedDict()
+    hooks = []
+
+    def make_hook(friendly_name):
+        def hook_fn(module, inp, out):
+            layers_info[friendly_name] = {
+                'input': list(inp[0].shape),
+                'kernel': list(module.weight.shape),
+                'stride': list(module.stride),
+                'padding': list(module.padding),
+                'dilation': list(module.dilation),
+                'transposed': isinstance(module, nn.ConvTranspose2d),
+                'output_padding': (list(module.output_padding)
+                                   if hasattr(module, 'output_padding') else [0, 0]),
+                'groups': module.groups,
+                'output': list(out.shape),
+            }
+        return hook_fn
+
+    for mod_name, module in model.named_modules():
+        if mod_name in name_map and isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)):
+            hooks.append(module.register_forward_hook(make_hook(name_map[mod_name])))
+
+    model.eval()
+    try:
+        with torch.no_grad():
+            model(torch.randn(*input_size))
+    finally:
+        for h in hooks:
+            h.remove()
+    return layers_info
+
+
+def _make_unique_key(info):
+    """Return a hashable tuple of all captured conv attributes, used to deduplicate layers across models."""
+    # Order of the fields inside the key; sequence-valued fields are frozen to tuples.
+    field_order = (
+        'input', 'kernel', 'stride', 'padding', 'dilation',
+        'transposed', 'output_padding', 'groups', 'output',
+    )
+    # These two are plain scalars and go into the key unchanged.
+    scalar_fields = ('transposed', 'groups')
+    return tuple(
+        info[name] if name in scalar_fields else tuple(info[name])
+        for name in field_order
+    )
+
+
+def load_layers_from_model(model_names, input_size=(1, 3, 64, 64)):
+    """
+    Extract unique conv2d layers directly from one or more torchvision models.
+
+    Args:
+        model_names: list of model name strings, e.g. ['resnet18', 'resnet50']
+        input_size:  tuple for the dummy forward pass, e.g. (1, 3, 64, 64)
+
+    Returns:
+        list of layer dicts (W,H,C ordering) for calculate_layer_config(); raises ValueError on an unknown model name
+    """
+    import importlib
+    import torch  # noqa: deferred import so torch is only needed when --model is used
+
+    builders = {
+        'resnet18': ('torchvision.models', 'resnet18', _build_name_map_resnet18),
+        'resnet50': ('torchvision.models', 'resnet50', _build_name_map_resnet50),
+    }
+
+    seen_keys = set()
+    unique_layers = []  # (friendly_name, info_dict, source_model)
+
+    for model_name in model_names:
+        model_name = model_name.strip().lower()
+        if model_name not in builders:
+            raise ValueError(f"Unsupported model '{model_name}'. Supported: {list(builders.keys())}")
+
+        mod_path, fn_name, name_map_fn = builders[model_name]
+        print(f"Loading {model_name}...")
+        tv = importlib.import_module(mod_path)
+        build_fn = getattr(tv, fn_name)
+        # Only layer shapes/hyper-parameters are recorded, so skip the pretrained-weight download
+        model = build_fn(weights=None)
+        model.eval()
+
+        name_map = name_map_fn()
+        layers_info = _get_conv_layers_via_hooks(model, name_map, input_size)
+
+        for name, info in layers_info.items():
+            key = _make_unique_key(info)
+            if key not in seen_keys:
+                seen_keys.add(key)
+                unique_layers.append((name, info, model_name))
+
+    print(f"Extracted {len(unique_layers)} unique conv layers from {', '.join(model_names)}")
+
+    # Convert NCHW / OIHW shapes to the internal (W, H, C[, N]) layer-dict format
+    layers = []
+    for layer_id, (name, info, _source) in enumerate(unique_layers):
+        _, in_c, in_h, in_w = info['input']
+        _, out_c, out_h, out_w = info['output']
+        _oc, in_channels, k_h, k_w = info['kernel']
+        layers.append({
+            'layer_id': layer_id,
+            'name': name,
+            'input': (in_w, in_h, in_c),
+            'output': (out_w, out_h, out_c),
+            'kernel': (k_w, k_h, in_channels, _oc),
+            'stride': tuple(info['stride']),
+            'padding': tuple(info['padding']),
+            'dilation': tuple(info['dilation']),
+        })
+    return layers
+
+
+# ---------------------------------------------------------------------------
+# PTE-based loader (ExecuTorch .pte binary via exir source tree)
+# ---------------------------------------------------------------------------
+
+# Default paths derived from this script's location (Path(__file__)).
+# This file lives in backends/cadence/vision/config_generator/, so five .parent hops reach the executorch checkout root.
+_EXECUTORCH_SRC = str(Path(__file__).parent.parent.parent.parent.parent)  # executorch checkout root, e.g. ext_test/executorch
+_EXECUTORCH_PARENT = str(Path(__file__).parent.parent.parent.parent.parent.parent)  # directory containing the checkout, e.g. ext_test
+_FLATC_DEFAULT = str(Path(__file__).parent.parent.parent.parent.parent /
+                     "cmake-out/third-party/flatc_ep/bin/flatc")  # default flatc location under the checkout's cmake-out directory
+
+
+def _bootstrap_executorch_imports(flatc_path=None):
+    """
+    Bootstrap executorch.exir from the local source tree without a pip install.
+
+    Bypasses exir/__init__.py (which pulls in many optional deps) by pre-populating
+    sys.modules with lightweight stub packages for 'executorch' and 'executorch.exir'
+    (only when they are not already imported).  Mutates sys.path, sys.modules and os.environ.
+
+    Sets FLATC_EXECUTABLE for _flatbuffer.py only if the flatc file exists and the variable is not already set.
+    """
+    import types
+
+    # Add the checkout's parent directory so `import executorch...` works, and the checkout itself so
+    # internal sub-imports like `from executorch.exir._serialize...` resolve correctly.
+    if _EXECUTORCH_PARENT not in sys.path:
+        sys.path.insert(0, _EXECUTORCH_PARENT)
+    if _EXECUTORCH_SRC not in sys.path:
+        sys.path.insert(0, _EXECUTORCH_SRC)
+
+    # Stub 'executorch' and 'executorch.exir' so Python never runs their
+    # __init__.py files; __path__ still lets sub-modules be found on disk.
+    for pkg, pkg_dir in [
+        ('executorch',       _EXECUTORCH_SRC),
+        ('executorch.exir',  _EXECUTORCH_SRC + '/exir'),
+    ]:
+        if pkg not in sys.modules:
+            m = types.ModuleType(pkg)
+            m.__path__ = [pkg_dir]
+            m.__package__ = pkg
+            sys.modules[pkg] = m
+
+    # setdefault: a FLATC_EXECUTABLE already present in the environment takes precedence over flatc_path / the default.
+    resolved = flatc_path or _FLATC_DEFAULT
+    if os.path.isfile(resolved):
+        os.environ.setdefault('FLATC_EXECUTABLE', resolved)
+
+
+def load_layers_from_pte(pte_file, flatc_path=None):
+    """
+    Extract unique conv2d layers directly from an ExecuTorch .pte binary.
+
+    Mirrors load_layers_from_model() but reads the serialised execution plan
+    instead of running a live forward pass.  Works without a full executorch
+    pip install by loading the _serialize sub-package from the local source
+    tree.  Only execution_plan[0] and its chains[0] are scanned.
+
+    Args:
+        pte_file:   Path to the .pte file (str or Path).
+        flatc_path: Optional path to the flatc binary.  Defaults to the
+                    cmake-out copy built alongside the source tree.
+
+    Returns:
+        list of layer dicts in the internal format expected by
+        calculate_layer_config(); names are synthesised from kernel size, stride and channels.
+    """
+    _bootstrap_executorch_imports(flatc_path)
+
+    from executorch.exir._serialize._program import deserialize_pte_binary
+    from executorch.exir.schema import KernelCall, Int, IntList, Tensor
+
+    pte_path = Path(pte_file)
+    print(f"Loading PTE: {pte_path} ...")
+
+    with open(pte_path, 'rb') as f:
+        pte_file_obj = deserialize_pte_binary(f.read())
+
+    # deserialize_pte_binary returns a PTEFile wrapper; unwrap to get Program
+    if hasattr(pte_file_obj, 'program'):
+        program = pte_file_obj.program
+    else:
+        program = pte_file_obj  # older API returned Program directly
+
+    plan   = program.execution_plan[0]  # only the first execution plan is inspected
+    values = plan.values
+
+    # ------------------------------------------------------------------
+    # Helpers to dereference EValue indices from the values table
+    # ------------------------------------------------------------------
+    def _tensor(idx):
+        v = values[idx].val
+        return v if isinstance(v, Tensor) else None
+
+    def _int_val(idx):
+        v = values[idx].val
+        return v.int_val if isinstance(v, Int) else None
+
+    def _intlist_val(idx):
+        """IntList.items are EValue indices pointing to Int EVals."""
+        v = values[idx].val
+        if isinstance(v, IntList):
+            return [_int_val(i) for i in v.items]
+        return None
+
+    # ------------------------------------------------------------------
+    # Walk all KernelCall instructions and collect conv layers
+    # ------------------------------------------------------------------
+    # cadence::quantized_conv2d_nchw arg order (from quantized_conv2d_nchw_out.cpp):
+    #   [0] input  [1] weight  [2] bias
+    #   [3] stride  [4] padding  [5] dilation
+    #   [6] groups  [7] in_zero_point  ... [-2/-1] out
+    CONV_OPS = {
+        'cadence::quantized_conv2d_nchw',
+        'aten::conv2d',
+        'aten::convolution',
+    }
+
+    seen_keys = set()
+    unique_layers = []
+
+    for instr in plan.chains[0].instructions:  # only the first chain is walked
+        ia = instr.instr_args
+        if not isinstance(ia, KernelCall):
+            continue
+        op_name = plan.operators[ia.op_index].name
+        if op_name not in CONV_OPS:
+            continue
+
+        args = ia.args
+        input_t  = _tensor(args[0])
+        weight_t = _tensor(args[1])
+        output_t = _tensor(args[-1])   # assumes the out tensor is the final KernelCall arg for every op in CONV_OPS - TODO confirm
+
+        if input_t is None or weight_t is None or output_t is None:
+            continue
+
+        stride   = _intlist_val(args[3]) or [1, 1]
+        padding  = _intlist_val(args[4]) or [0, 0]
+        dilation = _intlist_val(args[5]) or [1, 1]
+
+        # shapes are NCHW (4-D is assumed; any other rank fails the unpacking below)
+        _, in_c,  in_h,  in_w  = input_t.sizes
+        _, out_c, out_h, out_w = output_t.sizes
+        _oc, _ic, k_h, k_w    = weight_t.sizes
+
+        info = {
+            'input':   (in_w,  in_h,  in_c),
+            'output':  (out_w, out_h, out_c),
+            'kernel':  (k_w,   k_h,   _ic,   _oc),
+            'stride':  tuple(stride),
+            'padding': tuple(padding),
+            'dilation':tuple(dilation),
+        }
+
+        key = (info['input'], info['output'], info['kernel'],
+               info['stride'], info['padding'], info['dilation'])
+        if key not in seen_keys:
+            seen_keys.add(key)
+            unique_layers.append(info)
+
+    print(f"Extracted {len(unique_layers)} unique conv layers from PTE")
+
+    # Convert to the internal layer-dict format (same as load_layers_from_model)
+    layers = []
+    for layer_id, info in enumerate(unique_layers):
+        in_w,  in_h,  in_c  = info['input']
+        out_w, out_h, out_c = info['output']
+        k_w,   k_h,   _ic,  _oc = info['kernel']
+        # Derive a friendly name from kernel size, first stride entry and channel counts
+        name = f"conv_{k_h}x{k_w}_s{info['stride'][0]}_ic{in_c}_oc{out_c}"
+        layers.append({
+            'layer_id':  layer_id,
+            'name':      name,
+            'input':     info['input'],
+            'output':    info['output'],
+            'kernel':    info['kernel'],
+            'stride':    info['stride'],
+            'padding':   info['padding'],
+            'dilation':  info['dilation'],
+        })
+    return layers
+
+
+# ---------------------------------------------------------------------------
+# File-based loaders (CSV / JSON)
+# ---------------------------------------------------------------------------
+
+def load_layers_from_json(json_file):
+    """Read a JSON file and return the decoded layer configurations unchanged."""
+    with open(json_file, 'r') as handle:
+        return json.loads(handle.read())
+
+def load_layers_from_csv(csv_file):
+    """Load layer configs from a tab-delimited ResNet CSV; returns a list of layer dicts (W,H,C ordering) for calculate_layer_config()."""
+    import csv
+
+    def _ints(cell):
+        """Parse a comma-separated cell such as "1, 3, 64, 64" into a tuple of ints."""
+        return tuple(int(x) for x in cell.strip().split(','))
+
+    layers = []
+    # newline='' lets the csv module do its own line-ending handling, as its documentation recommends
+    with open(csv_file, 'r', newline='') as f:
+        for row in csv.reader(f, delimiter='\t'):
+            # Skip header or empty rows
+            if not row or not row[0].strip() or 'input' in row[0].lower() or (len(row) > 1 and 'input' in row[1].lower()):
+                continue
+
+            # Columns: layer_name, input, kernel, stride, padding, dilation, transposed, output_padding, groups, output
+            layer_name = row[0].strip()
+            input_shape = _ints(row[1])    # e.g., "1,3,64,64"
+            kernel_shape = _ints(row[2])   # e.g., "64,3,7,7"
+            stride = _ints(row[3])         # e.g., "2, 2"
+            padding = _ints(row[4])
+            # Honour the dilation column like the other loaders do; a blank cell means no dilation
+            dilation = _ints(row[5]) if row[5].strip() else (1, 1)
+            output_shape = _ints(row[9])   # e.g., "1,64,32,32"
+
+            # Convert NCHW / OIHW shapes to the internal (W, H, C[, N]) format
+            _, in_c, in_h, in_w = input_shape
+            _, out_c, out_h, out_w = output_shape
+            out_channels, in_channels, k_h, k_w = kernel_shape
+
+            # layer_id is the running index of accepted (non-header, non-empty) rows
+            layers.append({
+                'layer_id': len(layers),
+                'name': layer_name,
+                'input': (in_w, in_h, in_c),
+                'output': (out_w, out_h, out_c),
+                'kernel': (k_w, k_h, in_channels, out_channels),
+                'stride': stride,
+                'padding': padding,
+                'dilation': dilation,
+            })
+
+    return layers
+
+def calculate_layer_config(layer, dram0_size, dram1_size):
+    """
+    Calculate complete buffer configuration for a single layer
+
+    Falls back to a single-tile cache-mode ("_no_dma") config for the whole
+    layer when find_max_tile_config() returns no usable DMA tiling.
+    Returns: Dictionary with all runtime parameters
+    """
+    # Unpack layer parameters
+    input_w, input_h, input_c = layer['input']
+    output_w, output_h, output_c = layer['output']
+    kernel_w, kernel_h, in_c, out_c = layer['kernel']
+    stride_w, stride_h = layer['stride']
+    pad_w, pad_h = layer['padding']
+
+    # Calculate padding edges
+    padding = (pad_w, pad_w, pad_h, pad_h, 0, 0)  # (dim1_e1, dim1_e2, dim2_e1, dim2_e2, ...)
+
+    # Dummy conv_params (will be set per-model)
+    conv_params = (stride_w, stride_h, 8, 4000, 11, 0, 1, kernel_h, kernel_w)
+
+    # Generate kernel name based on size and stride
+    if kernel_h == 7 and kernel_w == 7 and stride_h == 2:
+        kernel_name = "7x7j2d1"
+    elif kernel_h == 3 and kernel_w == 3 and stride_h == 1:
+        kernel_name = "3x3j1d1"
+    elif kernel_h == 3 and kernel_w == 3 and stride_h == 2:
+        kernel_name = "3x3j2d1"
+    elif kernel_h == 1 and kernel_w == 1 and stride_h == 2:
+        kernel_name = "1x1j2d1"
+    elif kernel_h == 1 and kernel_w == 1 and stride_h == 1:
+        kernel_name = "1x1j1d1"
+    else:
+        kernel_name = f"{kernel_w}x{kernel_h}j{stride_w}d1"
+
+    # Find optimal tiling configuration
+    n_tile_size, output_rows, buffer_sizes = find_max_tile_config(
+        input_whd=(input_w, input_h, input_c),
+        output_whd=(output_w, output_h, output_c),
+        kernel_whdn=(kernel_w, kernel_h, in_c, out_c),
+        padding=padding,
+        stride_xy=(stride_w, stride_h),
+        kernel_name=kernel_name,
+        data_type="S8S8",
+        dram0_size=dram0_size,
+        dram1_size=dram1_size,
+        conv_params=conv_params
+    )
+
+    if buffer_sizes is None or n_tile_size == 0 or output_rows == 0:
+        print(f"WARNING: Could not find valid DMA configuration for layer {layer['layer_id']} - using cache mode (single tile)")
+
+        # Calculate pitches with padding for cache mode
+        # in_dim1_size = src_dim1_size (actual input width)
+        # in_dim1_pitch = input_w + 2*pad_w (width including padding)
+        in_dim1_pitch = input_w + 2 * pad_w
+        in_dim2_pitch = in_dim1_pitch * (input_h + 2 * pad_h)
+        out_dim1_pitch = output_w
+        out_dim2_pitch = out_dim1_pitch * output_h
+        coeff_dim1_pitch = kernel_w
+        coeff_dim2_pitch = coeff_dim1_pitch * kernel_h
+        coeff_dim3_pitch = coeff_dim2_pitch * in_c
+
+        # Calculate buffer sizes for full tile (no tiling - process entire layer)
+        input_buffer_size = in_dim2_pitch * input_c
+        output_buffer_size = out_dim2_pitch * output_c
+        coeff_buffer_size = coeff_dim3_pitch * output_c
+        bias_buffer_size = output_c * 4  # S32
+        outscale_buffer_size = output_c * 2  # U16
+
+        # Data offset is 0 for cache mode (no pre-allocated padding in buffer)
+        in_data_offset = 0
+
+        # Return cache-mode config: single tile processing entire layer
+        return {
+            'layer_id': layer['layer_id'],
+            'layer_name': layer['name'],
+            'kernel_name': kernel_name + "_no_dma",
+            'src_dim1_size': input_w, 'src_dim2_size': input_h, 'src_dim3_size': input_c,
+            'src_dim1_pitch': input_w, 'src_dim2_pitch': input_w * input_h,
+            'dst_dim1_size': output_w, 'dst_dim2_size': output_h, 'dst_dim3_size': output_c,
+            'dst_dim1_pitch': output_w, 'dst_dim2_pitch': output_w * output_h,
+            'in_dim1_size': input_w, 'in_dim1_pitch': in_dim1_pitch,
+            'in_dim2_size': input_h, 'in_dim2_pitch': in_dim2_pitch,
+            'in_dim1_edge1': pad_w, 'in_dim1_edge2': pad_w, 'in_dim2_edge1': pad_h, 'in_dim2_edge2': pad_h,
+            'in_dim3_edge1': 0, 'in_dim3_edge2': 0, 'in_data_offset': in_data_offset, 'in_rows_firstdma': input_h,
+            'out_dim1_size': output_w, 'out_dim1_pitch': out_dim1_pitch,
+            'out_dim2_size': output_h, 'out_dim2_pitch': out_dim2_pitch, 'out_dim3_size': output_c,
+            'coeff_dim1_size': kernel_w, 'coeff_dim2_size': kernel_h, 'coeff_dim3_size': in_c, 'coeff_dim4_size': output_c,
+            'coeff_dim1_pitch': coeff_dim1_pitch, 'coeff_dim2_pitch': coeff_dim2_pitch, 'coeff_dim3_pitch': coeff_dim3_pitch,
+            'bias_dim1_size': output_c, 'bias_dim2_size': 1,
+            'outscale_dim1_size': output_c, 'outscale_dim2_size': 1,
+            'input_buffer_size': input_buffer_size, 'coeff_buffer_size': coeff_buffer_size,
+            'output_buffer_size': output_buffer_size,
+            'bias_buffer_size': bias_buffer_size, 'outscale_buffer_size': outscale_buffer_size,
+            'input_ping_dram': 0, 'input_pong_dram': 0, 'coeff_dram': 0,
+            'output_ping_dram': 0, 'output_pong_dram': 0, 'bias_dram': 0, 'outscale_dram': 0,
+            'n_tile_size': output_c, 'n_tiles': 1, 'n_tile_size_last': output_c, 'height_tiles': 1,
+            'output_rows': output_h, 'input_rows': input_h,
+            'stride_x': stride_w, 'stride_y': stride_h, 'accum_shift': 8, 'relu_max': 4000,
+            'relu_min': 0, 'output_shift': 11, 'output_scale': 0, 'dilation': 1,
+            'kernel_w': kernel_w, 'kernel_h': kernel_h, 'padding': pad_w, 'flags': 0,
+            'input_zero_point': 0,
+            # Generate unique config key: ic_ih_iw_oc_kh_kw_oh_ow_sy_sx_pad_dil
+            'config_key': f"{input_c}_{input_h}_{input_w}_{output_c}_{kernel_h}_{kernel_w}_{output_h}_{output_w}_{stride_h}_{stride_w}_{pad_w}_1",
+        }
+
+    # Calculate additional derived parameters
+    input_rows = kernel_h + (output_rows - 1) * stride_h
+
+    # Get buffer placement
+    placement = calculate_buffer_placement(buffer_sizes, dram0_size, dram1_size)
+
+    # Build complete config with all fields from convIdma_buffers.h schema
+    config = {
+        'layer_id': layer['layer_id'],
+        'layer_name': layer['name'],
+        'kernel_name': kernel_name + "_dma",
+
+        # Source dimensions
+        'src_dim1_size': buffer_sizes['SRC_DIM1_SIZE'],
+        'src_dim2_size': buffer_sizes['SRC_DIM2_SIZE'],
+        'src_dim3_size': buffer_sizes['SRC_DIM3_SIZE'],
+        'src_dim1_pitch': buffer_sizes['SRC_DIM1_PITCH'],
+        'src_dim2_pitch': buffer_sizes['SRC_DIM2_PITCH'],
+
+        # Destination dimensions
+        'dst_dim1_size': buffer_sizes['DST_DIM1_SIZE'],
+        'dst_dim2_size': buffer_sizes['DST_DIM2_SIZE'],
+        'dst_dim1_pitch': buffer_sizes['DST_DIM1_PITCH'],
+        'dst_dim2_pitch': buffer_sizes['DST_DIM2_PITCH'],
+        'dst_dim3_size': output_c,
+
+        # Input tile dimensions
+        'in_dim1_size': buffer_sizes['IN_DIM1_SIZE'],
+        'in_dim1_pitch': buffer_sizes['IN_DIM1_PITCH'],
+        'in_dim2_size': buffer_sizes['IN_DIM2_SIZE'],
+        'in_dim2_pitch': buffer_sizes['IN_DIM2_PITCH'],
+        'in_dim1_edge1': padding[0],
+        'in_dim1_edge2': padding[1],
+        'in_dim2_edge1': padding[2],
+        'in_dim2_edge2': padding[3],
+        'in_dim3_edge1': padding[4],
+        'in_dim3_edge2': padding[5],
+        'in_data_offset': buffer_sizes['IN_DATA_OFFSET'],
+        'in_rows_firstdma': buffer_sizes['IN_ROWS_FIRSTDMA'],
+
+        # Output tile dimensions
+        'out_dim1_size': buffer_sizes['OUT_DIM1_SIZE'],
+        'out_dim1_pitch': buffer_sizes['OUT_DIM1_PITCH'],
+        'out_dim2_size': buffer_sizes['OUT_DIM2_SIZE'],
+        'out_dim2_pitch': buffer_sizes['OUT_DIM2_PITCH'],
+        'out_dim3_size': buffer_sizes['OUT_DIM3_SIZE'],
+
+        # Coefficient tile dimensions
+        'coeff_dim1_size': buffer_sizes['COEFF_DIM1_SIZE'],
+        'coeff_dim2_size': buffer_sizes['COEFF_DIM2_SIZE'],
+        'coeff_dim3_size': buffer_sizes['COEFF_DIM3_SIZE'],
+        'coeff_dim4_size': buffer_sizes['COEFF_DIM4_SIZE'],
+        'coeff_dim1_pitch': buffer_sizes['COEFF_DIM1_PITCH'],
+        'coeff_dim2_pitch': buffer_sizes['COEFF_DIM2_PITCH'],
+        'coeff_dim3_pitch': buffer_sizes['COEFF_DIM3_PITCH'],
+
+        # Bias dimensions
+        'bias_dim1_size': buffer_sizes['BIAS_DIM1_SIZE'],
+        'bias_dim2_size': buffer_sizes['BIAS_DIM2_SIZE'],
+
+        # Output scale dimensions
+        'outscale_dim1_size': buffer_sizes['OUTSCALE_DIM1_SIZE'],
+        'outscale_dim2_size': buffer_sizes['OUTSCALE_DIM2_SIZE'],
+
+        # Buffer sizes
+        'input_buffer_size': buffer_sizes['IN'],
+        'coeff_buffer_size': buffer_sizes['COEFF'],
+        'output_buffer_size': buffer_sizes['OUT'],
+        'bias_buffer_size': buffer_sizes['BIAS'],
+        'outscale_buffer_size': buffer_sizes['OUTSCALE'],
+
+        # Buffer DRAM placement (0 or 1)
+        'input_ping_dram': placement.get('IN1_dram', 0),
+        'input_pong_dram': placement.get('IN2_dram', 1),
+        'coeff_dram': placement.get('COEFF_dram', 0),
+        'output_ping_dram': placement.get('OUT1_dram', 1),
+        'output_pong_dram': placement.get('OUT2_dram', 1),
+        'bias_dram': placement.get('BIAS_dram', 1),
+        'outscale_dram': placement.get('OUTSCALE_dram', 1),
+
+        # Tiling parameters
+        'n_tile_size': buffer_sizes['N_TILE_SIZE'],
+        'n_tiles': buffer_sizes['N_TILES'],
+        'n_tile_size_last': buffer_sizes['N_TILE_SIZE_LAST'],
+        'height_tiles': buffer_sizes['HIGHT_TILES'],
+        'output_rows': output_rows,
+        'input_rows': input_rows,
+
+        # Convolution parameters
+        'stride_x': buffer_sizes.get('STRIDEX', stride_w),
+        'stride_y': buffer_sizes.get('STRIDEY', stride_h),
+        'accum_shift': buffer_sizes.get('ACCUM_SHIFT', 8),
+        'relu_max': buffer_sizes.get('RELU_MAX', 4000),
+        'relu_min': buffer_sizes.get('RELU_MIN', 0),
+        'output_shift': buffer_sizes.get('OUTPUT_SHIFT', 11),
+        'output_scale': buffer_sizes.get('OUTPUT_SCALE', 0),
+        'dilation': buffer_sizes.get('DILATION', 1),
+        'kernel_w': kernel_w,
+        'kernel_h': kernel_h,
+        'padding': pad_w,  # Symmetric padding
+        'flags': buffer_sizes.get('FLAGS', 0),
+        'input_zero_point': 0,
+    }
+
+    # Generate unique config key from the layer's input/output dims (same fields as the cache-mode key)
+    # Format: ic_ih_iw_oc_kh_kw_oh_ow_sy_sx_pad_dil
+    dilation = buffer_sizes.get('DILATION', 1)
+    config['config_key'] = f"{input_c}_{input_h}_{input_w}_{output_c}_{kernel_h}_{kernel_w}_{output_h}_{output_w}_{stride_h}_{stride_w}_{pad_w}_{dilation}"
+
+    return config
+
+def generate_c_header(configs, output_file, dram0_size=32768, dram1_size=32768, no_dma_mode=False):
+    """
+    Generate C header file with lookup table (written as UTF-8: comments contain '×')
+
+    Output: output_file with:
+    - typedef struct conv_layer_config_t and CONV_LAYER_CONFIGS[] (one entry per config)
+    - NUM_CONV_LAYERS, IDMA_BUFFER_SIZE_DRAM0/1 (both 0 when no_dma_mode is set)
+    - get_num_conv_layers(), get_layer_config(), get_layer_config_by_params(),
+      get_layer_config_by_key()
+    """
+    # Explicit UTF-8: the emitted comments contain '×'; do not depend on the locale default.
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write("""/*
+ * conv_layer_configs.h
+ *
+ * Auto-generated convolution layer configurations
+ * Generated from model layer extraction
+ *
+ * DO NOT EDIT MANUALLY - Regenerate with generate_layer_configs.py
+ */
+
+#ifndef CONV_LAYER_CONFIGS_H
+#define CONV_LAYER_CONFIGS_H
+
+#include <stdint.h>
+#include <stddef.h>  // for NULL
+
+/**
+ * Runtime configuration for a single convolution layer
+ * Contains all parameters needed to execute the layer
+ * Matches convIdma_buffers.h schema
+ */
+typedef struct {
+    // Layer identification
+    int layer_id;
+    const char* layer_name;
+    const char* kernel_name;
+    const char* config_key;     // Unique key: ic_ih_iw_oc_kh_kw_oh_ow_sy_sx_pad_dil
+    
+    // Source (DRAM) dimensions
+    int src_dim1_size;      // Input width in DRAM
+    int src_dim2_size;      // Input height in DRAM
+    int src_dim3_size;      // Input channels in DRAM
+    int src_dim1_pitch;     // DRAM row pitch
+    int src_dim2_pitch;     // DRAM plane pitch
+    
+    // Destination (DRAM) dimensions
+    int dst_dim1_size;      // Output width in DRAM
+    int dst_dim2_size;      // Output height in DRAM
+    int dst_dim3_size;      // Output channels in DRAM
+    int dst_dim1_pitch;     // DRAM row pitch
+    int dst_dim2_pitch;     // DRAM plane pitch
+    
+    // Input tile (local memory) dimensions
+    int in_dim1_size;       // Tile width (with padding)
+    int in_dim1_pitch;      // Tile row pitch
+    int in_dim2_size;       // Tile height (rows per iteration)
+    int in_dim2_pitch;      // Tile plane pitch
+    int in_dim1_edge1;      // Left padding
+    int in_dim1_edge2;      // Right padding
+    int in_dim2_edge1;      // Top padding
+    int in_dim2_edge2;      // Bottom padding
+    int in_dim3_edge1;      // Channel padding (usually 0)
+    int in_dim3_edge2;      // Channel padding (usually 0)
+    int in_data_offset;     // Offset to actual data in buffer
+    int in_rows_firstdma;   // Rows to transfer in first DMA
+    
+    // Output tile (local memory) dimensions
+    int out_dim1_size;      // Output width
+    int out_dim1_pitch;     // Output row pitch
+    int out_dim2_size;      // Output rows per iteration
+    int out_dim2_pitch;     // Output plane pitch
+    int out_dim3_size;      // Output channels per N-tile
+    
+    // Coefficient tile dimensions
+    int coeff_dim1_size;    // Kernel width
+    int coeff_dim2_size;    // Kernel height
+    int coeff_dim3_size;    // Input channels
+    int coeff_dim4_size;    // Output channels (total)
+    int coeff_dim1_pitch;   // Kernel row pitch
+    int coeff_dim2_pitch;   // Kernel plane pitch (W*H)
+    int coeff_dim3_pitch;   // Kernel 3D pitch (W*H*D)
+    
+    // Bias array dimensions
+    int bias_dim1_size;     // Number of bias values
+    int bias_dim2_size;     // Always 1
+    
+    // Output scale array dimensions
+    int outscale_dim1_size; // Number of scale values
+    int outscale_dim2_size; // Always 1
+    
+    // Buffer sizes (bytes)
+    int input_buffer_size;
+    int coeff_buffer_size;
+    int output_buffer_size;
+    int bias_buffer_size;
+    int outscale_buffer_size;
+    
+    // Buffer DRAM placement (0 = DRAM0, 1 = DRAM1)
+    int input_ping_dram;
+    int input_pong_dram;
+    int coeff_dram;
+    int output_ping_dram;
+    int output_pong_dram;
+    int bias_dram;
+    int outscale_dram;
+    
+    // Tiling parameters
+    int n_tile_size;        // Output channels per N-tile
+    int n_tiles;            // Total number of N-tiles
+    int n_tile_size_last;   // Channels in last N-tile
+    int height_tiles;       // Total number of H-tiles
+    int output_rows;        // Output rows per H-tile
+    int input_rows;         // Input rows needed per H-tile
+    
+    // Convolution parameters
+    int kernel_w;
+    int kernel_h;
+    int stride_x;
+    int stride_y;
+    int padding;            // Symmetric padding
+    int dilation;
+    int accum_shift;        // Accumulator shift
+    int relu_max;           // ReLU clamp maximum
+    int relu_min;           // ReLU clamp minimum
+    int output_shift;       // Output quantization shift
+    int output_scale;       // Output scale factor
+    int flags;              // Convolution flags
+    int input_zero_point;   // Input zero point for padding fill
+    
+} conv_layer_config_t;
+
+""")
+
+        # Generate lookup table
+        f.write("// Total number of convolution layers\n")
+        f.write(f"#define NUM_CONV_LAYERS {len(configs)}\n\n")
+
+        # Generate IDMA buffer size macros
+        _dram0_macro = 0 if no_dma_mode else dram0_size
+        _dram1_macro = 0 if no_dma_mode else dram1_size
+        f.write(f" #define IDMA_BUFFER_SIZE_DRAM0 ({_dram0_macro}) // {_dram0_macro // 1024} KB for DRAM0\n")
+        f.write(f" #define IDMA_BUFFER_SIZE_DRAM1 ({_dram1_macro}) // {_dram1_macro // 1024} KB for DRAM1\n\n")
+
+        f.write("// Layer configuration lookup table\n")
+        f.write("static const conv_layer_config_t CONV_LAYER_CONFIGS[] = {\n")
+
+        for config in configs:
+            f.write("    {\n")
+            f.write(f"        .layer_id = {config['layer_id']},\n")
+            f.write(f"        .layer_name = \"{config['layer_name']}\",\n")
+            f.write(f"        .kernel_name = \"{config['kernel_name']}\",\n")
+            f.write(f"        .config_key = \"{config['config_key']}\",\n")
+            f.write("        \n")
+
+            # Source dimensions
+            f.write(f"        // Source (DRAM): {config['src_dim1_size']}×{config['src_dim2_size']}×{config['src_dim3_size']}\n")
+            f.write(f"        .src_dim1_size = {config['src_dim1_size']},\n")
+            f.write(f"        .src_dim2_size = {config['src_dim2_size']},\n")
+            f.write(f"        .src_dim3_size = {config['src_dim3_size']},\n")
+            f.write(f"        .src_dim1_pitch = {config['src_dim1_pitch']},\n")
+            f.write(f"        .src_dim2_pitch = {config['src_dim2_pitch']},\n")
+            f.write("        \n")
+
+            # Destination dimensions
+            f.write(f"        // Destination (DRAM): {config['dst_dim1_size']}×{config['dst_dim2_size']}×{config['dst_dim3_size']}\n")
+            f.write(f"        .dst_dim1_size = {config['dst_dim1_size']},\n")
+            f.write(f"        .dst_dim2_size = {config['dst_dim2_size']},\n")
+            f.write(f"        .dst_dim3_size = {config['dst_dim3_size']},\n")
+            f.write(f"        .dst_dim1_pitch = {config['dst_dim1_pitch']},\n")
+            f.write(f"        .dst_dim2_pitch = {config['dst_dim2_pitch']},\n")
+            f.write("        \n")
+
+            # Input tile dimensions
+            f.write(f"        // Input tile: {config['in_dim1_size']}×{config['in_dim2_size']} (edges: {config['in_dim1_edge1']},{config['in_dim1_edge2']},{config['in_dim2_edge1']},{config['in_dim2_edge2']})\n")
+            f.write(f"        .in_dim1_size = {config['in_dim1_size']},\n")
+            f.write(f"        .in_dim1_pitch = {config['in_dim1_pitch']},\n")
+            f.write(f"        .in_dim2_size = {config['in_dim2_size']},\n")
+            f.write(f"        .in_dim2_pitch = {config['in_dim2_pitch']},\n")
+            f.write(f"        .in_dim1_edge1 = {config['in_dim1_edge1']},\n")
+            f.write(f"        .in_dim1_edge2 = {config['in_dim1_edge2']},\n")
+            f.write(f"        .in_dim2_edge1 = {config['in_dim2_edge1']},\n")
+            f.write(f"        .in_dim2_edge2 = {config['in_dim2_edge2']},\n")
+            f.write(f"        .in_dim3_edge1 = {config['in_dim3_edge1']},\n")
+            f.write(f"        .in_dim3_edge2 = {config['in_dim3_edge2']},\n")
+            f.write(f"        .in_data_offset = {config['in_data_offset']},\n")
+            f.write(f"        .in_rows_firstdma = {config['in_rows_firstdma']},\n")
+            f.write("        \n")
+
+            # Output tile dimensions
+            f.write(f"        // Output tile: {config['out_dim1_size']}×{config['out_dim2_size']}×{config['out_dim3_size']}\n")
+            f.write(f"        .out_dim1_size = {config['out_dim1_size']},\n")
+            f.write(f"        .out_dim1_pitch = {config['out_dim1_pitch']},\n")
+            f.write(f"        .out_dim2_size = {config['out_dim2_size']},\n")
+            f.write(f"        .out_dim2_pitch = {config['out_dim2_pitch']},\n")
+            f.write(f"        .out_dim3_size = {config['out_dim3_size']},\n")
+            f.write("        \n")
+
+            # Coefficient dimensions
+            f.write(f"        // Coefficients: {config['coeff_dim1_size']}×{config['coeff_dim2_size']}×{config['coeff_dim3_size']}×{config['coeff_dim4_size']}\n")
+            f.write(f"        .coeff_dim1_size = {config['coeff_dim1_size']},\n")
+            f.write(f"        .coeff_dim2_size = {config['coeff_dim2_size']},\n")
+            f.write(f"        .coeff_dim3_size = {config['coeff_dim3_size']},\n")
+            f.write(f"        .coeff_dim4_size = {config['coeff_dim4_size']},\n")
+            f.write(f"        .coeff_dim1_pitch = {config['coeff_dim1_pitch']},\n")
+            f.write(f"        .coeff_dim2_pitch = {config['coeff_dim2_pitch']},\n")
+            f.write(f"        .coeff_dim3_pitch = {config['coeff_dim3_pitch']},\n")
+            f.write("        \n")
+
+            # Bias and outscale
+            f.write(f"        // Bias/Outscale: {config['bias_dim1_size']}\n")
+            f.write(f"        .bias_dim1_size = {config['bias_dim1_size']},\n")
+            f.write(f"        .bias_dim2_size = {config['bias_dim2_size']},\n")
+            f.write(f"        .outscale_dim1_size = {config['outscale_dim1_size']},\n")
+            f.write(f"        .outscale_dim2_size = {config['outscale_dim2_size']},\n")
+            f.write("        \n")
+
+            # Buffer sizes
+            f.write("        // Buffer sizes (bytes)\n")
+            f.write(f"        .input_buffer_size = {config['input_buffer_size']},\n")
+            f.write(f"        .coeff_buffer_size = {config['coeff_buffer_size']},\n")
+            f.write(f"        .output_buffer_size = {config['output_buffer_size']},\n")
+            f.write(f"        .bias_buffer_size = {config['bias_buffer_size']},\n")
+            f.write(f"        .outscale_buffer_size = {config['outscale_buffer_size']},\n")
+            f.write("        \n")
+
+            # DRAM placement
+            f.write("        // DRAM placement\n")
+            f.write(f"        .input_ping_dram = {config['input_ping_dram']},\n")
+            f.write(f"        .input_pong_dram = {config['input_pong_dram']},\n")
+            f.write(f"        .coeff_dram = {config['coeff_dram']},\n")
+            f.write(f"        .output_ping_dram = {config['output_ping_dram']},\n")
+            f.write(f"        .output_pong_dram = {config['output_pong_dram']},\n")
+            f.write(f"        .bias_dram = {config['bias_dram']},\n")
+            f.write(f"        .outscale_dram = {config['outscale_dram']},\n")
+            f.write("        \n")
+
+            # Tiling parameters
+            f.write(f"        // Tiling: {config['n_tile_size']} ch/tile × {config['n_tiles']} tiles, {config['output_rows']} rows/tile × {config['height_tiles']} tiles\n")
+            f.write(f"        .n_tile_size = {config['n_tile_size']},\n")
+            f.write(f"        .n_tiles = {config['n_tiles']},\n")
+            f.write(f"        .n_tile_size_last = {config['n_tile_size_last']},\n")
+            f.write(f"        .height_tiles = {config['height_tiles']},\n")
+            f.write(f"        .output_rows = {config['output_rows']},\n")
+            f.write(f"        .input_rows = {config['input_rows']},\n")
+            f.write("        \n")
+
+            # Convolution parameters
+            f.write(f"        // Conv params: {config['kernel_w']}×{config['kernel_h']}, stride {config['stride_x']}×{config['stride_y']}, pad {config['padding']}\n")
+            f.write(f"        .kernel_w = {config['kernel_w']},\n")
+            f.write(f"        .kernel_h = {config['kernel_h']},\n")
+            f.write(f"        .stride_x = {config['stride_x']},\n")
+            f.write(f"        .stride_y = {config['stride_y']},\n")
+            f.write(f"        .padding = {config['padding']},\n")
+            f.write(f"        .dilation = {config['dilation']},\n")
+            f.write(f"        .accum_shift = {config['accum_shift']},\n")
+            f.write(f"        .relu_max = {config['relu_max']},\n")
+            f.write(f"        .relu_min = {config['relu_min']},\n")
+            f.write(f"        .output_shift = {config['output_shift']},\n")
+            f.write(f"        .output_scale = {config['output_scale']},\n")
+            f.write(f"        .flags = {config['flags']},\n")
+            f.write(f"        .input_zero_point = {config['input_zero_point']},\n")
+            f.write("    },\n")
+
+        f.write("};\n\n")
+
+        # Generate accessor functions
+        f.write("""
+/**
+ * Get total number of convolution layers
+ */
+static inline int get_num_conv_layers(void) {
+    return NUM_CONV_LAYERS;
+}
+
+/**
+ * Get configuration for a specific layer by layer_id
+ * 
+ * @param layer_id Layer index (0 to NUM_CONV_LAYERS-1)
+ * @return Pointer to configuration, or NULL if invalid layer_id
+ */
+static inline const conv_layer_config_t* get_layer_config(int layer_id) {
+    if (layer_id < 0 || layer_id >= NUM_CONV_LAYERS) {
+        return NULL;
+    }
+    return &CONV_LAYER_CONFIGS[layer_id];
+}
+
+/**
+ * Get configuration for a layer by its parameters
+ * Searches for a layer matching the given convolution parameters
+ *
+ * @param ic   Input channels
+ * @param ih   Input height
+ * @param iw   Input width
+ * @param oc   Output channels
+ * @param kh   Kernel height
+ * @param kw   Kernel width
+ * @param oh   Output height
+ * @param ow   Output width
+ * @param sy   Stride Y
+ * @param sx   Stride X
+ * @param pad  Padding (symmetric)
+ * @param dil  Dilation
+ * @return Pointer to configuration, or NULL if not found
+ */
+static inline const conv_layer_config_t* get_layer_config_by_params(
+    int ic, int ih, int iw,
+    int oc, int kh, int kw,
+    int oh, int ow,
+    int sy, int sx,
+    int pad, int dil) {
+    
+    for (int i = 0; i < NUM_CONV_LAYERS; i++) {
+        const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i];
+        if (cfg->src_dim3_size == ic &&
+            cfg->src_dim2_size == ih &&
+            cfg->src_dim1_size == iw &&
+            cfg->dst_dim3_size == oc &&
+            cfg->coeff_dim2_size == kh &&
+            cfg->coeff_dim1_size == kw &&
+            cfg->dst_dim2_size == oh &&
+            cfg->dst_dim1_size == ow &&
+            cfg->stride_y == sy &&
+            cfg->stride_x == sx &&
+            cfg->padding == pad &&
+            cfg->dilation == dil) {
+            return cfg;
+        }
+    }
+    return NULL;
+}
+
+/**
+ * Get configuration for a layer by config key string
+ * Key format: "ic_ih_iw_oc_kh_kw_oh_ow_sy_sx_pad_dil"
+ *
+ * @param config_key The unique configuration key string
+ * @return Pointer to configuration, or NULL if not found
+ */
+static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) {
+    if (config_key == NULL) return NULL;
+    
+    for (int i = 0; i < NUM_CONV_LAYERS; i++) {
+        const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i];
+        // Simple string comparison
+        const char* a = cfg->config_key;
+        const char* b = config_key;
+        int match = 1;
+        while (*a && *b) {
+            if (*a++ != *b++) { match = 0; break; }
+        }
+        if (match && *a == *b) return cfg;
+    }
+    return NULL;
+}
+
+#endif // CONV_LAYER_CONFIGS_H
+""")
+
+    print(f"Generated {output_file}")
+
+def main():
+    """
+    Command-line entry point: load layers, compute per-layer configs, write the C header.
+    Returns: 0 on success, 1 on a reported error (argparse usage errors exit with status 2)
+    """
+    parser = argparse.ArgumentParser(
+        description='Generate convolution layer configuration lookup table',
+        epilog='One of --model, --pte, or a positional input_file (csv/json) is required.'
+    )
+    parser.add_argument('input_file', nargs='?', default=None,
+                       help='Input file (layers_config.json or resnet_conv_list.csv). '
+                            'Not needed when using --model or --pte.')
+    parser.add_argument('--model', '-m', default=None,
+                       help='Extract layers directly from PyTorch model(s). '
+                            'Comma or + separated list. '
+                            f'Supported: {", ".join(SUPPORTED_MODELS)}. '
+                            'Example: --model resnet18+resnet50')
+    parser.add_argument('--pte', nargs='+', default=None,
+                       help='Extract layers from one or more ExecuTorch .pte binaries. '
+                            'Example: --pte resnet18.pte resnet50.pte')
+    parser.add_argument('--flatc', default=None,
+                       help='Path to flatc binary (default: cmake-out/third-party/flatc_ep/bin/flatc)')
+    parser.add_argument('--input-size', default='1,3,64,64',
+                       help='Model input tensor shape as N,C,H,W (default: 1,3,64,64)')
+    parser.add_argument('--output', '-o', default='conv_layer_configs.h',
+                       help='Output C header file (default: conv_layer_configs.h)')
+    parser.add_argument('--dram0', type=int, default=DRAM_SIZE_0,
+                       help=f'DRAM0 size in bytes (default: {DRAM_SIZE_0})')
+    parser.add_argument('--dram1', type=int, default=DRAM_SIZE_1,
+                       help=f'DRAM1 size in bytes (default: {DRAM_SIZE_1})')
+    parser.add_argument('--no-dma-mode', action='store_true', default=False,
+                       help='Force all configs to no-DMA mode: changes _dma suffix to _no_dma for every kernel name')
+
+    args = parser.parse_args()
+    # ---- Load layers: --model, --pte, or input_file ----
+    if args.pte:
+        all_layers = []
+        seen_keys = set()
+        for pte_arg in args.pte:
+            pte_path = Path(pte_arg)
+            if not pte_path.exists():
+                print(f"ERROR: PTE file not found: {pte_path}")
+                return 1
+            print(f"Extracting layers from PTE: {pte_path}")
+            pte_layers = load_layers_from_pte(pte_path, flatc_path=args.flatc)
+            for pte_layer in pte_layers:
+                key = (pte_layer['input'], pte_layer['output'], pte_layer['kernel'],
+                       pte_layer['stride'], pte_layer['padding'], pte_layer['dilation'])
+                if key not in seen_keys:
+                    seen_keys.add(key)
+                    pte_layer['layer_id'] = len(all_layers)
+                    all_layers.append(pte_layer)
+                else:
+                    print(f"  [skip duplicate] {pte_layer['name']}")
+        layers = all_layers
+        print(f"Total unique layers from {len(args.pte)} PTE file(s): {len(layers)}")
+    elif args.model:
+        # Parse model names (accept comma or + as separator)
+        model_names = [n.strip() for n in args.model.replace('+', ',').split(',') if n.strip()]
+        input_size = tuple(int(x) for x in args.input_size.split(','))
+        print(f"Extracting layers from model(s): {', '.join(model_names)}  input_size={input_size}")
+        layers = load_layers_from_model(model_names, input_size)
+    elif args.input_file:
+        input_path = Path(args.input_file)
+        if not input_path.exists():
+            print(f"ERROR: Input file not found: {input_path}")
+            return 1
+        print(f"Loading layers from {input_path}...")
+        if input_path.suffix == '.json':
+            layers = load_layers_from_json(input_path)
+        elif input_path.suffix == '.csv':
+            layers = load_layers_from_csv(input_path)
+        else:
+            print(f"ERROR: Unsupported file type: {input_path.suffix}")
+            print("Supported: .json, .csv")
+            return 1
+    else:
+        parser.error('One of --model, --pte, or a positional input_file is required.')  # exits (status 2)
+
+    print(f"Loaded {len(layers)} layers")
+
+    # Calculate configurations for all layers
+    print(f"\nCalculating buffer configurations (DRAM0={args.dram0}, DRAM1={args.dram1})...")
+    configs = []
+    for layer in layers:
+        print(f"  Processing layer {layer['layer_id']}: {layer['name']}...")
+        config = calculate_layer_config(layer, args.dram0, args.dram1)
+        if config:
+            configs.append(config)
+            print(f"    [OK] n_tile={config['n_tile_size']}, n_tiles={config['n_tiles']}, "
+                  f"output_rows={config['output_rows']}, height_tiles={config['height_tiles']}")
+        else:
+            print("    ✗ Failed to calculate configuration")
+
+    if not configs:
+        print("ERROR: No valid configurations generated")
+        return 1
+    print(f"\nGenerated {len(configs)} valid configurations")
+
+    # Apply no-DMA mode: rename *_dma kernels to *_no_dma; names already ending in _no_dma are left alone
+    if args.no_dma_mode:
+        for config in configs:
+            if config['kernel_name'].endswith('_dma') and not config['kernel_name'].endswith('_no_dma'):
+                config['kernel_name'] = config['kernel_name'][:-4] + '_no_dma'
+        print("No-DMA mode enabled: all kernel names suffixed with _no_dma")
+
+    # Generate C header
+    generate_c_header(configs, args.output, args.dram0, args.dram1, args.no_dma_mode)
+    print(f"\nSuccess! Generated {args.output}")
+    print("Use in C code:")
+    print(f"  #include \"{args.output}\"")
+    print("  const conv_layer_config_t* config = get_layer_config(0);")
+    print("  conv_execute_layer(0, input, output, weights, bias, outscale);")
+
+    return 0
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/backends/cadence/vision/config_generator/layer_configs_16k.h b/backends/cadence/vision/config_generator/layer_configs_16k.h
new file mode 100644
index 00000000000..13df04f97c5
--- /dev/null
+++ b/backends/cadence/vision/config_generator/layer_configs_16k.h
@@ -0,0 +1,2403 @@
+/*
+ * layer_configs.h
+ *
+ * Auto-generated conv2d + maxpool layer configurations
+ * Generated from PTE extraction by generate_combined_configs.py
+ *
+ * DO NOT EDIT MANUALLY
+ */
+
+#ifndef LAYER_CONFIGS_H
+#define LAYER_CONFIGS_H
+
+#include <stdint.h>
+#include <stddef.h>  /* for NULL */
+
+#define IDMA_BUFFER_SIZE_DRAM0 (16384)  /* 16 KB */
+#define IDMA_BUFFER_SIZE_DRAM1 (16384)  /* 16 KB */
+
+/* ====================================================================== */
+/*  Conv2d configurations                                              */
+/* ====================================================================== */
+
+typedef struct {  /* Per-layer conv2d shape/tiling/buffer parameters; element type of CONV_LAYER_CONFIGS[]. "observed" = holds for the table entries reviewed, not a documented guarantee. */
+    int layer_id;  /* observed: equals the entry's index in CONV_LAYER_CONFIGS[] */
+    const char* layer_name;  /* descriptive label, e.g. "conv_7x7_s2_ic3_oc64" */
+    const char* kernel_name;  /* kernel variant label, e.g. "7x7j2d1_dma"; appears to encode kernel size, stride (j) and dilation (d) -- TODO confirm */
+    const char* config_key;  /* underscore-joined shape key, e.g. "3_64_64_64_7_7_32_32_2_2_3_1" */
+    /* Whole source tensor. observed: dim3 matches the "ic" count in layer_name, dim1_pitch == dim1_size, dim2_pitch == dim1_size * dim2_size. */
+    int src_dim1_size;  int src_dim2_size;  int src_dim3_size;
+    int src_dim1_pitch; int src_dim2_pitch;
+    /* Whole destination tensor. observed: dim3 matches the "oc" count in layer_name, dim1_pitch == dim1_size, dim2_pitch == dim1_size * dim2_size. */
+    int dst_dim1_size;  int dst_dim2_size;  int dst_dim3_size;
+    int dst_dim1_pitch; int dst_dim2_pitch;
+    /* Input tile geometry (presumably the layout of the local input buffer -- TODO confirm against the kernels). */
+    int in_dim1_size;   int in_dim1_pitch;  /* observed: pitch == in_dim1_size + in_dim1_edge1 + in_dim1_edge2 */
+    int in_dim2_size;   int in_dim2_pitch;  /* observed: size == input_rows, pitch == in_dim1_pitch * in_dim2_size */
+    int in_dim1_edge1;  int in_dim1_edge2;  /* observed: both equal padding */
+    int in_dim2_edge1;  int in_dim2_edge2;  /* observed: both equal padding */
+    int in_dim3_edge1;  int in_dim3_edge2;  /* observed: always 0 */
+    int in_data_offset; int in_rows_firstdma;  /* observed: offset == in_dim2_edge1 * in_dim1_pitch + in_dim1_edge1; in_rows_firstdma == input_rows - padding */
+    /* Output tile geometry. observed: dim1_size == dst_dim1_size, dim2_size == output_rows, dim3_size == n_tile_size. */
+    int out_dim1_size;  int out_dim1_pitch;  /* observed: pitch == out_dim1_size */
+    int out_dim2_size;  int out_dim2_pitch;  /* observed: pitch == out_dim1_pitch * out_dim2_size */
+    int out_dim3_size;
+    /* Coefficient (weight) tensor. observed: dim1/dim2 == kernel_w/kernel_h, dim3 == src_dim3_size, dim4 == dst_dim3_size. */
+    int coeff_dim1_size;  int coeff_dim2_size;
+    int coeff_dim3_size;  int coeff_dim4_size;
+    int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch;  /* observed: cumulative products dim1, dim1*dim2, dim1*dim2*dim3 */
+    /* Bias and output-scale vectors. observed: dim1_size == dst_dim3_size and dim2_size == 1 for both. */
+    int bias_dim1_size;     int bias_dim2_size;
+    int outscale_dim1_size; int outscale_dim2_size;
+    /* Buffer sizes (presumably bytes, to be compared with IDMA_BUFFER_SIZE_DRAM0/1 -- TODO confirm units). */
+    int input_buffer_size;  int coeff_buffer_size;  int output_buffer_size;  /* observed: in_dim2_pitch * src_dim3_size; coeff_dim3_pitch * n_tile_size; out_dim2_pitch * out_dim3_size */
+    int bias_buffer_size;   int outscale_buffer_size;  /* observed: 4 * bias_dim1_size; 2 * outscale_dim1_size */
+    /* Memory placement of each buffer. observed values are 0 or 1; presumably selects the DRAM0 or DRAM1 bank -- TODO confirm. */
+    int input_ping_dram;  int input_pong_dram;  int coeff_dram;
+    int output_ping_dram; int output_pong_dram;
+    int bias_dram;        int outscale_dram;
+    /* Tiling. observed: n_tile_size * n_tiles == dst_dim3_size; n_tile_size_last == n_tile_size; height_tiles == ceil(dst_dim2_size / output_rows). */
+    int n_tile_size; int n_tiles; int n_tile_size_last;
+    int height_tiles; int output_rows; int input_rows;  /* observed: input_rows == (output_rows - 1) * stride_y + kernel_h */
+    /* Convolution and requantization parameters. */
+    int kernel_w; int kernel_h;
+    int stride_x; int stride_y;
+    int padding;  int dilation;  /* observed: dilation is 1 in every entry */
+    int accum_shift; int relu_max; int relu_min;  /* observed: 8, 4000 and 0 in every entry */
+    int output_shift; int output_scale; int flags;  /* observed: 11, 0 and 0 in every entry */
+    int input_zero_point;  /* observed: 0 in every entry */
+} conv_layer_config_t;
+
+#define NUM_CONV_LAYERS 29
+
+static const conv_layer_config_t CONV_LAYER_CONFIGS[] = {
+    {
+        .layer_id = 0,
+        .layer_name = "conv_7x7_s2_ic3_oc64",
+        .kernel_name = "7x7j2d1_dma",
+        .config_key = "3_64_64_64_7_7_32_32_2_2_3_1",
+        .src_dim1_size = 64,
+        .src_dim2_size = 64,
+        .src_dim3_size = 3,
+        .src_dim1_pitch = 64,
+        .src_dim2_pitch = 4096,
+        .dst_dim1_size = 32,
+        .dst_dim2_size = 32,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 32,
+        .dst_dim2_pitch = 1024,
+        .in_dim1_size = 64,
+        .in_dim1_pitch = 70,
+        .in_dim2_size = 13,
+        .in_dim2_pitch = 910,
+        .in_dim1_edge1 = 3,
+        .in_dim1_edge2 = 3,
+        .in_dim2_edge1 = 3,
+        .in_dim2_edge2 = 3,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 213,
+        .in_rows_firstdma = 10,
+        .out_dim1_size = 32,
+        .out_dim1_pitch = 32,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 128,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 7,
+        .coeff_dim2_size = 7,
+        .coeff_dim3_size = 3,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 7,
+        .coeff_dim2_pitch = 49,
+        .coeff_dim3_pitch = 147,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2730,
+        .coeff_buffer_size = 9408,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 8,
+        .output_rows = 4,
+        .input_rows = 13,
+        .kernel_w = 7,
+        .kernel_h = 7,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 3,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 1,
+        .layer_name = "conv_3x3_s1_ic64_oc64",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "64_16_16_64_3_3_16_16_1_1_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 72,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 4608,
+        .coeff_buffer_size = 9216,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 4,
+        .n_tile_size_last = 16,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 2,
+        .layer_name = "conv_1x1_s2_ic64_oc128",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "64_16_16_128_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 7,
+        .in_dim2_pitch = 112,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 7,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7168,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 2,
+        .output_rows = 4,
+        .input_rows = 7,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 3,
+        .layer_name = "conv_3x3_s2_ic64_oc128",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "64_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5760,
+        .coeff_buffer_size = 9216,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 8,
+        .n_tile_size_last = 16,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 4,
+        .layer_name = "conv_3x3_s1_ic128_oc128",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "128_8_8_128_3_3_8_8_1_1_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 40,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5120,
+        .coeff_buffer_size = 9216,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 16,
+        .n_tile_size_last = 8,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 5,
+        .layer_name = "conv_1x1_s2_ic128_oc256",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "128_8_8_256_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 4,
+        .n_tile_size_last = 64,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 6,
+        .layer_name = "conv_3x3_s2_ic128_oc256",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "128_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6400,
+        .coeff_buffer_size = 9216,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 32,
+        .n_tile_size_last = 8,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 7,
+        .layer_name = "conv_3x3_s1_ic256_oc256",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "256_4_4_256_3_3_4_4_1_1_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 4,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6144,
+        .coeff_buffer_size = 9216,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 4,
+        .n_tiles = 64,
+        .n_tile_size_last = 4,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 8,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "256_4_4_512_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 12,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 16,
+        .n_tile_size_last = 32,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 9,
+        .layer_name = "conv_3x3_s2_ic256_oc512",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "256_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 30,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 4,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7680,
+        .coeff_buffer_size = 9216,
+        .output_buffer_size = 16,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 4,
+        .n_tiles = 128,
+        .n_tile_size_last = 4,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 10,
+        .layer_name = "conv_3x3_s1_ic512_oc512",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "512_2_2_512_3_3_2_2_1_1_1_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 5,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 2,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 9216,
+        .output_buffer_size = 8,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 2,
+        .n_tiles = 256,
+        .n_tile_size_last = 2,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 11,
+        .layer_name = "conv_1x1_s1_ic64_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_256_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 32,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 2,
+        .n_tile_size_last = 128,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 12,
+        .layer_name = "conv_1x1_s1_ic64_oc64",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 6,
+        .in_dim2_pitch = 96,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 6,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 6,
+        .out_dim2_pitch = 96,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6144,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 6144,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 3,
+        .output_rows = 6,
+        .input_rows = 6,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 13,
+        .layer_name = "conv_1x1_s1_ic256_oc64",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 32,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 2,
+        .n_tile_size_last = 32,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 14,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "256_16_16_512_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 48,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 32,
+        .n_tile_size_last = 16,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 15,
+        .layer_name = "conv_1x1_s1_ic256_oc128",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_16_16_128_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 32,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 4,
+        .n_tile_size_last = 32,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 16,
+        .layer_name = "conv_3x3_s2_ic128_oc128",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "128_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 4,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 11520,
+        .coeff_buffer_size = 4608,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 4,
+        .n_tiles = 32,
+        .n_tile_size_last = 4,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 17,
+        .layer_name = "conv_1x1_s1_ic128_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "128_8_8_512_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 8,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 18,
+        .layer_name = "conv_1x1_s1_ic512_oc128",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_8_8_128_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 8,
+        .n_tile_size_last = 16,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 19,
+        .layer_name = "conv_1x1_s2_ic512_oc1024",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 8,
+        .in_dim2_pitch = 64,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 8,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 1024,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 32768,
+        .coeff_buffer_size = 524288,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 1024,
+        .n_tiles = 1,
+        .n_tile_size_last = 1024,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 8,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 20,
+        .layer_name = "conv_1x1_s1_ic512_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_8_8_256_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 16,
+        .n_tile_size_last = 16,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 21,
+        .layer_name = "conv_3x3_s2_ic256_oc256",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "256_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 1,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12800,
+        .coeff_buffer_size = 2304,
+        .output_buffer_size = 8,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 1,
+        .n_tiles = 256,
+        .n_tile_size_last = 1,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 22,
+        .layer_name = "conv_1x1_s1_ic256_oc1024",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 32,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 23,
+        .layer_name = "conv_1x1_s1_ic1024_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 32,
+        .n_tile_size_last = 8,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 24,
+        .layer_name = "conv_1x1_s2_ic1024_oc2048",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 2048,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 16384,
+        .coeff_buffer_size = 2097152,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 2048,
+        .n_tiles = 1,
+        .n_tile_size_last = 2048,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 25,
+        .layer_name = "conv_1x1_s1_ic1024_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 64,
+        .n_tile_size_last = 8,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 26,
+        .layer_name = "conv_3x3_s2_ic512_oc512",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "512_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 36,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 18432,
+        .coeff_buffer_size = 2359296,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 27,
+        .layer_name = "conv_1x1_s1_ic512_oc2048",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 128,
+        .n_tile_size_last = 16,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 28,
+        .layer_name = "conv_1x1_s1_ic2048_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 2048,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 4,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 2048,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 2048,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 16,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 4,
+        .n_tiles = 128,
+        .n_tile_size_last = 4,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+};
+
+static inline int get_num_conv_layers(void) { /* NUM_CONV_LAYERS is the bound the conv lookup helpers scan to. */ const int count = NUM_CONV_LAYERS; return count; }
+
+static inline const conv_layer_config_t* get_conv_config(int layer_id) {
+    /* layer_id is used directly as the table index; anything outside [0, NUM_CONV_LAYERS) yields NULL. */
+    return (layer_id >= 0 && layer_id < NUM_CONV_LAYERS) ? CONV_LAYER_CONFIGS + layer_id : NULL;
+}
+
+static inline const conv_layer_config_t* get_layer_config_by_params(
+    int ic, int ih, int iw,
+    int oc, int kh, int kw,
+    int oh, int ow,
+    int sy, int sx,
+    int pad, int dil)
+{
+    /* Linear scan: the first table entry agreeing with all twelve arguments is returned, else NULL. */
+    for (int idx = 0; idx < NUM_CONV_LAYERS; ++idx) {
+        const conv_layer_config_t* entry = CONV_LAYER_CONFIGS + idx;
+        /* ic / ih / iw are checked against source dims 3 / 2 / 1. */
+        if (entry->src_dim3_size != ic || entry->src_dim2_size != ih || entry->src_dim1_size != iw)
+            continue;
+        /* oc against destination dim 3; kh / kw against coefficient dims 2 / 1. */
+        if (entry->dst_dim3_size != oc || entry->coeff_dim2_size != kh || entry->coeff_dim1_size != kw)
+            continue;
+        /* oh / ow against destination dims 2 / 1. */
+        if (entry->dst_dim2_size != oh || entry->dst_dim1_size != ow) continue;
+        /* Stride (y, x), then padding and dilation. */
+        if (entry->stride_y != sy || entry->stride_x != sx) continue;
+        if (entry->padding != pad || entry->dilation != dil) continue;
+        return entry;
+    }
+    return NULL;
+}
+
+static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) {
+    if (!config_key) return NULL;
+    for (int idx = 0; idx < NUM_CONV_LAYERS; ++idx) {
+        const char* stored = CONV_LAYER_CONFIGS[idx].config_key;
+        if (!stored) continue;  /* an entry without a key can never match */
+        const char* wanted = config_key;
+        /* Hand-rolled exact comparison: advance while both strings agree and the request has not ended. */
+        while (*wanted != '\0' && *wanted == *stored) { ++wanted; ++stored; }
+        /* A match requires both strings to terminate at the same position. */
+        if (*wanted == '\0' && *stored == '\0') return &CONV_LAYER_CONFIGS[idx];
+    }
+    return NULL;
+}
+
+/* ====================================================================== */
+/*  MaxPool configurations                                             */
+/* ====================================================================== */
+
+typedef struct {  /* One entry of MAXPOOL_LAYER_CONFIGS. */
+    int layer_id;
+    const char* layer_name;
+    const char* config_key;
+    /* Source / destination extents; channels, src_height and src_width are lookup fields. */
+    int src_width;  int src_height;  int channels;
+    int dst_width;  int dst_height;
+    /* Row and plane pitches of the source / destination layouts (units not stated here). */
+    int src_row_pitch;  int src_plane_pitch;
+    int dst_row_pitch;  int dst_plane_pitch;
+    /* Pooling window, stride and padding; all six are compared by get_maxpool_config_by_params. */
+    int kernel_h;  int kernel_w;
+    int stride_h;  int stride_w;
+    int pad_h;     int pad_w;
+    /* Per-tile input / output geometry; presumably describes the locally staged tile - TODO confirm. */
+    int in_tile_w;      int in_tile_rows;   int in_tile_plane;
+    int in_data_offset;
+    int out_tile_w;     int out_tile_rows;  int out_tile_plane;
+    /* Tiling counts: c_* over channels (with a separate size for the last tile), height_tiles over rows. */
+    int c_tile_size;  int c_tiles;  int c_tile_size_last;
+    int height_tiles; int output_rows; int input_rows;
+    /* Staging buffer sizes (presumably bytes - confirm against the generator). */
+    int input_buffer_size;  int output_buffer_size;
+    /* 0 or 1 per ping/pong buffer; presumably selects the DRAM bank it is placed in - confirm. */
+    int input_ping_dram;  int input_pong_dram;
+    int output_ping_dram; int output_pong_dram;
+} maxpool_layer_config_t;
+
+#define NUM_MAXPOOL_LAYERS 1  /* loop bound used by the maxpool lookup helpers below */
+
+static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = {
+    {
+        .layer_id = 0,
+        .layer_name = "maxpool_3x3s2_c64_32x32",
+        .config_key = "64_32_32_3_3_2_2_1_1",
+        .src_width = 32,
+        .src_height = 32,
+        .channels = 64,
+        .dst_width = 16,  /* (32 + 2*1 - 3) / 2 + 1 */
+        .dst_height = 16,
+        .src_row_pitch = 32,
+        .src_plane_pitch = 1024,  /* 32 * 32 */
+        .dst_row_pitch = 16,
+        .dst_plane_pitch = 256,  /* 16 * 16 */
+        .kernel_h = 3,
+        .kernel_w = 3,
+        .stride_h = 2,
+        .stride_w = 2,
+        .pad_h = 1,
+        .pad_w = 1,
+        .in_tile_w = 34,  /* matches src_width + 2 * pad_w */
+        .in_tile_rows = 5,  /* matches input_rows + 2 * pad_h */
+        .in_tile_plane = 170,  /* in_tile_w * in_tile_rows */
+        .in_data_offset = 35,  /* matches pad_h * in_tile_w + pad_w */
+        .out_tile_w = 16,
+        .out_tile_rows = 1,
+        .out_tile_plane = 16,  /* out_tile_w * out_tile_rows */
+        .c_tile_size = 22,
+        .c_tiles = 3,
+        .c_tile_size_last = 20,  /* 22 + 22 + 20 = 64 channels */
+        .height_tiles = 16,  /* height_tiles * output_rows = dst_height */
+        .output_rows = 1,
+        .input_rows = 3,  /* matches (output_rows - 1) * stride_h + kernel_h */
+        .input_buffer_size = 14960,  /* equals in_tile_plane * c_tile_size * 4 */
+        .output_buffer_size = 1408,  /* equals out_tile_plane * c_tile_size * 4 */
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 0,
+    },
+};
+
+static inline int get_num_maxpool_layers(void) { /* NUM_MAXPOOL_LAYERS is the bound the maxpool lookup helpers scan to. */ const int count = NUM_MAXPOOL_LAYERS; return count; }
+
+static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) {
+    /* layer_id is used directly as the table index; anything outside [0, NUM_MAXPOOL_LAYERS) yields NULL. */
+    return (layer_id >= 0 && layer_id < NUM_MAXPOOL_LAYERS) ? MAXPOOL_LAYER_CONFIGS + layer_id : NULL;
+}
+
+static inline const maxpool_layer_config_t* get_maxpool_config_by_params(
+    int channels, int src_height, int src_width,
+    int kernel_h, int kernel_w,
+    int stride_h, int stride_w,
+    int pad_h, int pad_w)
+{
+    /* Linear scan: the first entry equal to the request in all nine fields is returned, else NULL. */
+    const maxpool_layer_config_t* entry = MAXPOOL_LAYER_CONFIGS;
+    for (int remaining = NUM_MAXPOOL_LAYERS; remaining > 0; --remaining, ++entry) {
+        /* Source shape. */
+        if (entry->channels != channels) continue;
+        if (entry->src_height != src_height || entry->src_width != src_width) continue;
+        /* Pooling window. */
+        if (entry->kernel_h != kernel_h || entry->kernel_w != kernel_w) continue;
+        /* Stride and padding. */
+        if (entry->stride_h != stride_h || entry->stride_w != stride_w) continue;
+        if (entry->pad_h != pad_h || entry->pad_w != pad_w) continue;
+        return entry;
+    }
+    return NULL;
+}
+
+#endif /* LAYER_CONFIGS_H */
diff --git a/backends/cadence/vision/config_generator/layer_configs_24k.h b/backends/cadence/vision/config_generator/layer_configs_24k.h
new file mode 100644
index 00000000000..0a78e6165f5
--- /dev/null
+++ b/backends/cadence/vision/config_generator/layer_configs_24k.h
@@ -0,0 +1,2403 @@
+/*
+ * layer_configs.h
+ *
+ * Auto-generated conv2d + maxpool layer configurations (24 KB per-DRAM iDMA buffer variant)
+ * Generated from PTE extraction by generate_combined_configs.py
+ *
+ * DO NOT EDIT MANUALLY
+ */
+
+#ifndef LAYER_CONFIGS_H
+#define LAYER_CONFIGS_H
+
+#include <stdint.h>
+#include <stddef.h>  /* for NULL */
+
+#define IDMA_BUFFER_SIZE_DRAM0 (24576)  /* 24 KB */
+#define IDMA_BUFFER_SIZE_DRAM1 (24576)  /* 24 KB */
+
+/* ====================================================================== */
+/*  Conv2d configurations                                              */
+/* ====================================================================== */
+
+typedef struct {  /* One entry of CONV_LAYER_CONFIGS. */
+    int layer_id;
+    const char* layer_name;
+    const char* kernel_name;
+    const char* config_key;
+    /* Source tensor sizes and pitches; dim3 carries the input-channel count (3 for "ic3"), dim1/dim2 presumably width/height. */
+    int src_dim1_size;  int src_dim2_size;  int src_dim3_size;
+    int src_dim1_pitch; int src_dim2_pitch;
+    /* Destination tensor, same dim ordering; dim3 carries the output-channel count. */
+    int dst_dim1_size;  int dst_dim2_size;  int dst_dim3_size;
+    int dst_dim1_pitch; int dst_dim2_pitch;
+    /* Input tile: in the entries here dim1_pitch = dim1_size + edge1 + edge2 and in_data_offset = dim2_edge1 * dim1_pitch + dim1_edge1. */
+    int in_dim1_size;   int in_dim1_pitch;
+    int in_dim2_size;   int in_dim2_pitch;
+    int in_dim1_edge1;  int in_dim1_edge2;
+    int in_dim2_edge1;  int in_dim2_edge2;
+    int in_dim3_edge1;  int in_dim3_edge2;
+    int in_data_offset; int in_rows_firstdma;
+    /* Output tile; out_dim3_size equals n_tile_size in the entries here. */
+    int out_dim1_size;  int out_dim1_pitch;
+    int out_dim2_size;  int out_dim2_pitch;
+    int out_dim3_size;
+    /* Coefficient tensor: dim1 x dim2 match kernel_w x kernel_h, dim3 the input channels, dim4 the output channels. */
+    int coeff_dim1_size;  int coeff_dim2_size;
+    int coeff_dim3_size;  int coeff_dim4_size;
+    int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch;
+    /* Bias and output-scale vectors; dim1 equals dst_dim3_size in the entries here. */
+    int bias_dim1_size;     int bias_dim2_size;
+    int outscale_dim1_size; int outscale_dim2_size;
+    /* Staging buffer sizes (presumably bytes - confirm against the generator). */
+    int input_buffer_size;  int coeff_buffer_size;  int output_buffer_size;
+    int bias_buffer_size;   int outscale_buffer_size;
+    /* 0 or 1 per buffer; presumably selects IDMA_BUFFER_SIZE_DRAM0 vs _DRAM1 placement - confirm. */
+    int input_ping_dram;  int input_pong_dram;  int coeff_dram;
+    int output_ping_dram; int output_pong_dram;
+    int bias_dram;        int outscale_dram;
+    /* Tiling counts: n_tile_size * n_tiles equals dst_dim3_size in the entries here; height_tiles counts row tiles. */
+    int n_tile_size; int n_tiles; int n_tile_size_last;
+    int height_tiles; int output_rows; int input_rows;
+    /* Kernel geometry, then shift / clamp / scale parameters whose semantics are defined by the consuming kernel (not visible here). */
+    int kernel_w; int kernel_h;
+    int stride_x; int stride_y;
+    int padding;  int dilation;
+    int accum_shift; int relu_max; int relu_min;
+    int output_shift; int output_scale; int flags;
+    int input_zero_point;
+} conv_layer_config_t;
+
+#define NUM_CONV_LAYERS 29
+
+static const conv_layer_config_t CONV_LAYER_CONFIGS[] = {
+    {
+        .layer_id = 0,
+        .layer_name = "conv_7x7_s2_ic3_oc64",
+        .kernel_name = "7x7j2d1_dma",
+        .config_key = "3_64_64_64_7_7_32_32_2_2_3_1",
+        .src_dim1_size = 64,
+        .src_dim2_size = 64,
+        .src_dim3_size = 3,
+        .src_dim1_pitch = 64,
+        .src_dim2_pitch = 4096,
+        .dst_dim1_size = 32,
+        .dst_dim2_size = 32,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 32,
+        .dst_dim2_pitch = 1024,
+        .in_dim1_size = 64,
+        .in_dim1_pitch = 70,
+        .in_dim2_size = 17,
+        .in_dim2_pitch = 1190,
+        .in_dim1_edge1 = 3,
+        .in_dim1_edge2 = 3,
+        .in_dim2_edge1 = 3,
+        .in_dim2_edge2 = 3,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 213,
+        .in_rows_firstdma = 14,
+        .out_dim1_size = 32,
+        .out_dim1_pitch = 32,
+        .out_dim2_size = 6,
+        .out_dim2_pitch = 192,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 7,
+        .coeff_dim2_size = 7,
+        .coeff_dim3_size = 3,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 7,
+        .coeff_dim2_pitch = 49,
+        .coeff_dim3_pitch = 147,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3570,
+        .coeff_buffer_size = 9408,
+        .output_buffer_size = 12288,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 6,
+        .output_rows = 6,
+        .input_rows = 17,
+        .kernel_w = 7,
+        .kernel_h = 7,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 3,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 1,
+        .layer_name = "conv_3x3_s1_ic64_oc64",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "64_16_16_64_3_3_16_16_1_1_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 72,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 4608,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 2,
+        .n_tile_size_last = 32,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 2,
+        .layer_name = "conv_1x1_s2_ic64_oc128",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "64_16_16_128_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 11,
+        .in_dim2_pitch = 176,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 11,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 6,
+        .out_dim2_pitch = 48,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 11264,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 6144,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 2,
+        .output_rows = 6,
+        .input_rows = 11,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 3,
+        .layer_name = "conv_3x3_s2_ic64_oc128",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "64_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5760,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 4,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 4,
+        .layer_name = "conv_3x3_s1_ic128_oc128",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "128_8_8_128_3_3_8_8_1_1_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 40,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5120,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 8,
+        .n_tile_size_last = 16,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 5,
+        .layer_name = "conv_1x1_s2_ic128_oc256",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "128_8_8_256_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 2,
+        .n_tile_size_last = 128,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 6,
+        .layer_name = "conv_3x3_s2_ic128_oc256",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "128_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6400,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 16,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 7,
+        .layer_name = "conv_3x3_s1_ic256_oc256",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "256_4_4_256_3_3_4_4_1_1_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6144,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 32,
+        .n_tile_size_last = 8,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 8,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "256_4_4_512_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 12,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 8,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 9,
+        .layer_name = "conv_3x3_s2_ic256_oc512",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "256_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 30,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7680,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 64,
+        .n_tile_size_last = 8,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 10,
+        .layer_name = "conv_3x3_s1_ic512_oc512",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "512_2_2_512_3_3_2_2_1_1_1_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 5,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 4,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 16,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 4,
+        .n_tiles = 128,
+        .n_tile_size_last = 4,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 11,
+        .layer_name = "conv_1x1_s1_ic64_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_256_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 48,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 3,
+        .out_dim2_pitch = 48,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 12288,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 6,
+        .output_rows = 3,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 12,
+        .layer_name = "conv_1x1_s1_ic64_oc64",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 10,
+        .in_dim2_pitch = 160,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 10,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 10,
+        .out_dim2_pitch = 160,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 10240,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 10240,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 2,
+        .output_rows = 10,
+        .input_rows = 10,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 13,
+        .layer_name = "conv_1x1_s1_ic256_oc64",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 48,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 3,
+        .out_dim2_pitch = 48,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 3072,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 6,
+        .output_rows = 3,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 14,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "256_16_16_512_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 48,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 8,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 15,
+        .layer_name = "conv_1x1_s1_ic256_oc128",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_16_16_128_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 32,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 2,
+        .n_tile_size_last = 64,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 16,
+        .layer_name = "conv_3x3_s2_ic128_oc128",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "128_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 11520,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 8,
+        .n_tile_size_last = 16,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 17,
+        .layer_name = "conv_1x1_s1_ic128_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "128_8_8_512_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 4,
+        .n_tile_size_last = 128,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 18,
+        .layer_name = "conv_1x1_s1_ic512_oc128",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_8_8_128_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 4,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 19,
+        .layer_name = "conv_1x1_s2_ic512_oc1024",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 32,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 20,
+        .layer_name = "conv_1x1_s1_ic512_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_8_8_256_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 8,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 21,
+        .layer_name = "conv_3x3_s2_ic256_oc256",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "256_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 4,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12800,
+        .coeff_buffer_size = 9216,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 4,
+        .n_tiles = 64,
+        .n_tile_size_last = 4,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 22,
+        .layer_name = "conv_1x1_s1_ic256_oc1024",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 16,
+        .n_tile_size_last = 64,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 23,
+        .layer_name = "conv_1x1_s1_ic1024_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 16,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 24,
+        .layer_name = "conv_1x1_s2_ic1024_oc2048",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 12,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 256,
+        .n_tile_size_last = 8,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 25,
+        .layer_name = "conv_1x1_s1_ic1024_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 32,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 26,
+        .layer_name = "conv_3x3_s2_ic512_oc512",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "512_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 30,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 2,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 15360,
+        .coeff_buffer_size = 9216,
+        .output_buffer_size = 8,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 2,
+        .n_tiles = 256,
+        .n_tile_size_last = 2,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 27,
+        .layer_name = "conv_1x1_s1_ic512_oc2048",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 64,
+        .n_tile_size_last = 32,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 28,
+        .layer_name = "conv_1x1_s1_ic2048_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 2048,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 2048,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 2048,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 64,
+        .n_tile_size_last = 8,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+};
+
+static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; }  /* value of the NUM_CONV_LAYERS macro */
+
+static inline const conv_layer_config_t* get_conv_config(int layer_id) {
+    /* layer_id is used directly as the table index; out-of-range ids yield NULL. */
+    return (0 <= layer_id && layer_id < NUM_CONV_LAYERS) ? &CONV_LAYER_CONFIGS[layer_id] : NULL;
+}
+
+static inline const conv_layer_config_t* get_layer_config_by_params(
+    int ic, int ih, int iw,
+    int oc, int kh, int kw,
+    int oh, int ow,
+    int sy, int sx,
+    int pad, int dil)
+{
+    /* Linear scan of CONV_LAYER_CONFIGS: return the first entry whose input
+     * shape, output shape, kernel, stride, padding and dilation all equal
+     * the arguments, or NULL when no entry matches. */
+    const conv_layer_config_t* entry = CONV_LAYER_CONFIGS;
+    const conv_layer_config_t* const end = entry + NUM_CONV_LAYERS;
+    for (; entry != end; ++entry) {
+        if (entry->src_dim3_size != ic || entry->src_dim2_size != ih) continue;
+        if (entry->src_dim1_size != iw || entry->dst_dim3_size != oc) continue;
+        if (entry->coeff_dim2_size != kh || entry->coeff_dim1_size != kw) continue;
+        if (entry->dst_dim2_size != oh || entry->dst_dim1_size != ow) continue;
+        if (entry->stride_y != sy || entry->stride_x != sx) continue;
+        if (entry->padding != pad || entry->dilation != dil) continue;
+        /* Every compared field agreed. */
+        return entry;
+    }
+    /* Scan finished without a match. */
+    return NULL;
+}
+
+static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) {
+    if (!config_key) return NULL;
+    for (int idx = 0; idx < NUM_CONV_LAYERS; ++idx) {
+        const conv_layer_config_t* entry = &CONV_LAYER_CONFIGS[idx];
+        const char* want = config_key;
+        const char* have = entry->config_key;
+        if (!have) continue;  /* an entry without a key never matches */
+        /* Exact match: walk the shared prefix, then both must sit on '\0'. */
+        while (*want != '\0' && *want == *have) { ++want; ++have; }
+        if (*want == '\0' && *have == '\0') return entry;
+    }
+    return NULL;
+}
+
+/* ====================================================================== */
+/*  MaxPool configurations                                             */
+/* ====================================================================== */
+
+typedef struct {  /* One max-pool layer: geometry, tiling and buffer placement. */
+    int layer_id;            /* used as the index into MAXPOOL_LAYER_CONFIGS by get_maxpool_config() */
+    const char* layer_name;  /* descriptive label, e.g. "maxpool_3x3s2_c64_32x32" */
+    const char* config_key;  /* appears to be channels_h_w_kh_kw_sh_sw_ph_pw joined with '_' */
+
+    int src_width;  int src_height;  int channels;  /* input extent */
+    int dst_width;  int dst_height;                 /* output extent */
+
+    int src_row_pitch;  int src_plane_pitch;  /* in the entry below: src_width and src_width * src_height */
+    int dst_row_pitch;  int dst_plane_pitch;  /* in the entry below: dst_width and dst_width * dst_height */
+
+    int kernel_h;  int kernel_w;  /* pooling window */
+    int stride_h;  int stride_w;
+    int pad_h;     int pad_w;
+
+    int in_tile_w;      int in_tile_rows;   int in_tile_plane;   /* in the entry below: plane = w * rows */
+    int in_data_offset;  /* in the entry below: in_tile_w * pad_h + pad_w */
+    int out_tile_w;     int out_tile_rows;  int out_tile_plane;  /* in the entry below: plane = w * rows */
+
+    int c_tile_size;  int c_tiles;  int c_tile_size_last;  /* channel tiling; the last tile may be smaller */
+    int height_tiles; int output_rows; int input_rows;     /* row tiling; presumably rows per tile - TODO confirm */
+
+    int input_buffer_size;  int output_buffer_size;  /* presumably bytes - TODO confirm against the generator */
+
+    int input_ping_dram;  int input_pong_dram;   /* 0 or 1 in the entry below; presumably a DRAM bank selector */
+    int output_ping_dram; int output_pong_dram;  /* same encoding as the input selectors */
+} maxpool_layer_config_t;
+
+#define NUM_MAXPOOL_LAYERS 1  /* bound used by the lookup helpers; must match the initializer count below */
+
+static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = {
+    {
+        .layer_id = 0,
+        .layer_name = "maxpool_3x3s2_c64_32x32",
+        .config_key = "64_32_32_3_3_2_2_1_1",
+        .src_width = 32,
+        .src_height = 32,
+        .channels = 64,
+        .dst_width = 16,
+        .dst_height = 16,
+        .src_row_pitch = 32,
+        .src_plane_pitch = 1024,  /* 32 * 32 */
+        .dst_row_pitch = 16,
+        .dst_plane_pitch = 256,  /* 16 * 16 */
+        .kernel_h = 3,
+        .kernel_w = 3,
+        .stride_h = 2,
+        .stride_w = 2,
+        .pad_h = 1,
+        .pad_w = 1,
+        .in_tile_w = 34,  /* src_width + 2 * pad_w */
+        .in_tile_rows = 5,  /* NOTE(review): differs from input_rows (3); equals input_rows + 2 * pad_h - confirm intent */
+        .in_tile_plane = 170,  /* in_tile_w * in_tile_rows */
+        .in_data_offset = 35,  /* in_tile_w * pad_h + pad_w */
+        .out_tile_w = 16,
+        .out_tile_rows = 1,
+        .out_tile_plane = 16,
+        .c_tile_size = 33,  /* 33 + 31 (last tile) = 64 channels */
+        .c_tiles = 2,
+        .c_tile_size_last = 31,
+        .height_tiles = 16,  /* height_tiles * output_rows = dst_height */
+        .output_rows = 1,
+        .input_rows = 3,
+        .input_buffer_size = 22440,  /* = in_tile_plane * c_tile_size * 4; meaning of the factor 4 is not shown here */
+        .output_buffer_size = 2112,  /* = out_tile_plane * c_tile_size * 4 */
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 0,
+    },
+};
+
+static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; }  /* value of the NUM_MAXPOOL_LAYERS macro */
+
+static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) {
+    /* layer_id is used directly as the table index; out-of-range ids yield NULL. */
+    return (0 <= layer_id && layer_id < NUM_MAXPOOL_LAYERS) ? &MAXPOOL_LAYER_CONFIGS[layer_id] : NULL;
+}
+
+static inline const maxpool_layer_config_t* get_maxpool_config_by_params(
+    int channels, int src_height, int src_width,
+    int kernel_h, int kernel_w,
+    int stride_h, int stride_w,
+    int pad_h, int pad_w)
+{
+    /* Linear scan of MAXPOOL_LAYER_CONFIGS: return the first entry whose
+     * channel count, input size, window, stride and padding all equal the
+     * arguments, or NULL when no entry matches. */
+    const maxpool_layer_config_t* entry = MAXPOOL_LAYER_CONFIGS;
+    const maxpool_layer_config_t* const end = entry + NUM_MAXPOOL_LAYERS;
+    for (; entry != end; ++entry) {
+        if (entry->channels != channels) continue;
+        if (entry->src_height != src_height || entry->src_width != src_width) continue;
+        if (entry->kernel_h != kernel_h || entry->kernel_w != kernel_w) continue;
+        if (entry->stride_h != stride_h || entry->stride_w != stride_w) continue;
+        if (entry->pad_h != pad_h || entry->pad_w != pad_w) continue;
+        return entry;
+    }
+    return NULL;
+}
+
+#endif /* LAYER_CONFIGS_H */
diff --git a/backends/cadence/vision/config_generator/layer_configs_32k.h b/backends/cadence/vision/config_generator/layer_configs_32k.h
new file mode 100644
index 00000000000..efa2c7a64ee
--- /dev/null
+++ b/backends/cadence/vision/config_generator/layer_configs_32k.h
@@ -0,0 +1,2403 @@
+/*
+ * layer_configs.h
+ *
+ * Auto-generated conv2d + maxpool layer configurations
+ * Generated from PTE extraction by generate_combined_configs.py
+ *
+ * DO NOT EDIT MANUALLY
+ */
+
+#ifndef LAYER_CONFIGS_H
+#define LAYER_CONFIGS_H
+
+#include <stdint.h>
+#include <stddef.h>  /* for NULL */
+
+#define IDMA_BUFFER_SIZE_DRAM0 (32768)  /* 32 KB */
+#define IDMA_BUFFER_SIZE_DRAM1 (32768)  /* 32 KB */
+
+/* ====================================================================== */
+/*  Conv2d configurations                                              */
+/* ====================================================================== */
+
+typedef struct {  /* One conv2d layer: tensor geometry, tile layout, buffers and post-processing constants. */
+    int layer_id;             /* 0-based; equals the entry's position in CONV_LAYER_CONFIGS below */
+    const char* layer_name;   /* label of the form conv_<k>x<k>_s<stride>_ic<in>_oc<out> */
+    const char* kernel_name;  /* e.g. "3x3j1d1_dma"; presumably names the kernel variant - TODO confirm */
+    const char* config_key;   /* appears to be ic_ih_iw_oc_kh_kw_oh_ow_sy_sx_pad_dil joined with '_' */
+
+    int src_dim1_size;  int src_dim2_size;  int src_dim3_size;  /* whole input; dim3 is the input channel count */
+    int src_dim1_pitch; int src_dim2_pitch;  /* in the entries below: dim1_size and dim1_size * dim2_size */
+
+    int dst_dim1_size;  int dst_dim2_size;  int dst_dim3_size;  /* whole output; dim3 is the output channel count */
+    int dst_dim1_pitch; int dst_dim2_pitch;  /* in the entries below: dim1_size and dim1_size * dim2_size */
+
+    int in_dim1_size;   int in_dim1_pitch;  /* input tile; below, pitch = size + dim1 edge1 + edge2 */
+    int in_dim2_size;   int in_dim2_pitch;  /* below, dim2 pitch = in_dim1_pitch * in_dim2_size */
+    int in_dim1_edge1;  int in_dim1_edge2;  /* equal to .padding in the entries below */
+    int in_dim2_edge1;  int in_dim2_edge2;  /* equal to .padding in the entries below */
+    int in_dim3_edge1;  int in_dim3_edge2;  /* 0 in the entries below */
+    int in_data_offset; int in_rows_firstdma;  /* below: edge rows * pitch + edge; in_dim2_size - in_dim2_edge1 */
+
+    int out_dim1_size;  int out_dim1_pitch;  /* output tile */
+    int out_dim2_size;  int out_dim2_pitch;  /* below, dim2 pitch = out_dim1_pitch * out_dim2_size */
+    int out_dim3_size;  /* equals n_tile_size in the entries below */
+
+    int coeff_dim1_size;  int coeff_dim2_size;  /* kernel extent (same values as kernel_w / kernel_h below) */
+    int coeff_dim3_size;  int coeff_dim4_size;  /* input and output channel counts */
+    int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch;  /* running products of the coeff sizes */
+
+    int bias_dim1_size;     int bias_dim2_size;      /* below: output channel count, 1 */
+    int outscale_dim1_size; int outscale_dim2_size;  /* below: output channel count, 1 */
+
+    int input_buffer_size;  int coeff_buffer_size;  int output_buffer_size;  /* presumably bytes - TODO confirm */
+    int bias_buffer_size;   int outscale_buffer_size;
+
+    int input_ping_dram;  int input_pong_dram;  int coeff_dram;  /* 0 or 1; presumably DRAM0 / DRAM1 selector */
+    int output_ping_dram; int output_pong_dram;
+    int bias_dram;        int outscale_dram;
+
+    int n_tile_size; int n_tiles; int n_tile_size_last;  /* output-channel tiling: size * tiles = dst_dim3_size below */
+    int height_tiles; int output_rows; int input_rows;   /* row tiling: height_tiles * output_rows = dst_dim2_size below */
+
+    int kernel_w; int kernel_h;
+    int stride_x; int stride_y;
+    int padding;  int dilation;
+    int accum_shift; int relu_max; int relu_min;  /* identical constants in every entry shown; semantics not visible here */
+    int output_shift; int output_scale; int flags;
+    int input_zero_point;
+} conv_layer_config_t;
+
+#define NUM_CONV_LAYERS 29
+
+static const conv_layer_config_t CONV_LAYER_CONFIGS[] = {
+    {
+        .layer_id = 0,
+        .layer_name = "conv_7x7_s2_ic3_oc64",
+        .kernel_name = "7x7j2d1_dma",
+        .config_key = "3_64_64_64_7_7_32_32_2_2_3_1",
+        .src_dim1_size = 64,
+        .src_dim2_size = 64,
+        .src_dim3_size = 3,
+        .src_dim1_pitch = 64,
+        .src_dim2_pitch = 4096,
+        .dst_dim1_size = 32,
+        .dst_dim2_size = 32,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 32,
+        .dst_dim2_pitch = 1024,
+        .in_dim1_size = 64,
+        .in_dim1_pitch = 70,
+        .in_dim2_size = 21,
+        .in_dim2_pitch = 1470,
+        .in_dim1_edge1 = 3,
+        .in_dim1_edge2 = 3,
+        .in_dim2_edge1 = 3,
+        .in_dim2_edge2 = 3,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 213,
+        .in_rows_firstdma = 18,
+        .out_dim1_size = 32,
+        .out_dim1_pitch = 32,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 256,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 7,
+        .coeff_dim2_size = 7,
+        .coeff_dim3_size = 3,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 7,
+        .coeff_dim2_pitch = 49,
+        .coeff_dim3_pitch = 147,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 4410,
+        .coeff_buffer_size = 9408,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 8,
+        .input_rows = 21,
+        .kernel_w = 7,
+        .kernel_h = 7,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 3,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 1,
+        .layer_name = "conv_3x3_s1_ic64_oc64",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "64_16_16_64_3_3_16_16_1_1_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 72,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 4608,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 2,
+        .n_tile_size_last = 32,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 2,
+        .layer_name = "conv_1x1_s2_ic64_oc128",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "64_16_16_128_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 15,
+        .in_dim2_pitch = 240,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 15,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 15360,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 15,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 3,
+        .layer_name = "conv_3x3_s2_ic64_oc128",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "64_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5760,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 4,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 4,
+        .layer_name = "conv_3x3_s1_ic128_oc128",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "128_8_8_128_3_3_8_8_1_1_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 40,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5120,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 8,
+        .n_tile_size_last = 16,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 5,
+        .layer_name = "conv_1x1_s2_ic128_oc256",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "128_8_8_256_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 2,
+        .n_tile_size_last = 128,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 6,
+        .layer_name = "conv_3x3_s2_ic128_oc256",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "128_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6400,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 16,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 7,
+        .layer_name = "conv_3x3_s1_ic256_oc256",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "256_4_4_256_3_3_4_4_1_1_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6144,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 32,
+        .n_tile_size_last = 8,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 8,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "256_4_4_512_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 12,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 8,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 9,
+        .layer_name = "conv_3x3_s2_ic256_oc512",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "256_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 30,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7680,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 64,
+        .n_tile_size_last = 8,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 10,
+        .layer_name = "conv_3x3_s1_ic512_oc512",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "512_2_2_512_3_3_2_2_1_1_1_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 5,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 4,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 16,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 4,
+        .n_tiles = 128,
+        .n_tile_size_last = 4,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 11,
+        .layer_name = "conv_1x1_s1_ic64_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_256_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 64,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 4096,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 4,
+        .output_rows = 4,
+        .input_rows = 4,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 12,
+        .layer_name = "conv_1x1_s1_ic64_oc64",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 14,
+        .in_dim2_pitch = 224,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 14,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 14,
+        .out_dim2_pitch = 224,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 14336,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 14336,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 2,
+        .output_rows = 14,
+        .input_rows = 14,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 13,
+        .layer_name = "conv_1x1_s1_ic256_oc64",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 64,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 16384,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 4,
+        .input_rows = 4,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 14,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "256_16_16_512_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 48,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 8,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 15,
+        .layer_name = "conv_1x1_s1_ic256_oc128",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_16_16_128_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 32,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 2,
+        .n_tile_size_last = 64,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 16,
+        .layer_name = "conv_3x3_s2_ic128_oc128",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "128_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 11520,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 8,
+        .n_tile_size_last = 16,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 17,
+        .layer_name = "conv_1x1_s1_ic128_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "128_8_8_512_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 4,
+        .n_tile_size_last = 128,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 18,
+        .layer_name = "conv_1x1_s1_ic512_oc128",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_8_8_128_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 4,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 19,
+        .layer_name = "conv_1x1_s2_ic512_oc1024",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 32,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 20,
+        .layer_name = "conv_1x1_s1_ic512_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_8_8_256_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 8,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 21,
+        .layer_name = "conv_3x3_s2_ic256_oc256",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "256_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12800,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 32,
+        .n_tile_size_last = 8,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 22,
+        .layer_name = "conv_1x1_s1_ic256_oc1024",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 16,
+        .n_tile_size_last = 64,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 23,
+        .layer_name = "conv_1x1_s1_ic1024_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 16,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 24,
+        .layer_name = "conv_1x1_s2_ic1024_oc2048",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 12,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 128,
+        .n_tile_size_last = 16,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 25,
+        .layer_name = "conv_1x1_s1_ic1024_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 32,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 26,
+        .layer_name = "conv_3x3_s2_ic512_oc512",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "512_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 30,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 4,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 15360,
+        .coeff_buffer_size = 18432,
+        .output_buffer_size = 16,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 4,
+        .n_tiles = 128,
+        .n_tile_size_last = 4,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 27,
+        .layer_name = "conv_1x1_s1_ic512_oc2048",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 64,
+        .n_tile_size_last = 32,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 28,
+        .layer_name = "conv_1x1_s1_ic2048_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 2048,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 2048,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 2048,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 64,
+        .n_tile_size_last = 8,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+};
+
+static inline int get_num_conv_layers(void) { const int count = NUM_CONV_LAYERS; return count; }  /* NUM_CONV_LAYERS: the bound the conv lookup helpers iterate to */
+
+static inline const conv_layer_config_t* get_conv_config(int layer_id) {  /* NULL unless 0 <= layer_id < NUM_CONV_LAYERS */
+    const int in_range = (layer_id >= 0) && (layer_id < NUM_CONV_LAYERS);
+    return in_range ? (CONV_LAYER_CONFIGS + layer_id) : NULL;  /* layer_id is used directly as the array index */
+}
+
+static inline const conv_layer_config_t* get_layer_config_by_params(
+    int ic, int ih, int iw,
+    int oc, int kh, int kw,
+    int oh, int ow,
+    int sy, int sx,
+    int pad, int dil)
+{   /* Linear scan of CONV_LAYER_CONFIGS; returns the first entry that no check below rejects, else NULL. */
+    for (int idx = 0; idx < NUM_CONV_LAYERS; ++idx) {
+        const conv_layer_config_t* entry = CONV_LAYER_CONFIGS + idx;
+        if (entry->src_dim3_size != ic) continue;
+        if (entry->src_dim2_size != ih) continue;
+        if (entry->src_dim1_size != iw) continue;
+        if (entry->dst_dim3_size != oc) continue;
+        if (entry->coeff_dim2_size != kh) continue;
+        if (entry->coeff_dim1_size != kw) continue;
+        if (entry->dst_dim2_size != oh) continue;
+        if (entry->dst_dim1_size != ow) continue;
+        if (entry->stride_y != sy) continue;
+        if (entry->stride_x != sx) continue;
+        if (entry->padding != pad) continue;
+        if (entry->dilation != dil) continue;
+        return entry;
+    }
+    return NULL;
+}
+
+static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) {
+    /* Exact, case-sensitive lookup by config_key, compared one character at a time. */
+    /* Returns NULL for a NULL key or when no entry matches; entries whose own key is NULL are skipped. */
+    if (!config_key) return NULL;
+    for (int idx = 0; idx < NUM_CONV_LAYERS; ++idx) {
+        const char* want = config_key;
+        const char* have = CONV_LAYER_CONFIGS[idx].config_key;
+        if (!have) continue;
+        for (; *want != '\0' && *want == *have; ++want, ++have) { }
+        if (*want == '\0' && *have == '\0') return &CONV_LAYER_CONFIGS[idx];
+    }
+    return NULL;
+}
+
+/* ====================================================================== */
+/*  MaxPool configurations                                             */
+/* ====================================================================== */
+
+typedef struct {  /* One generated max-pool layer record: tensor geometry, tiling, buffer sizes and DRAM selectors. */
+    int layer_id;            /* matches the entry's index in MAXPOOL_LAYER_CONFIGS; get_maxpool_config() indexes by it */
+    const char* layer_name;  /* readable tag, e.g. "maxpool_3x3s2_c64_32x32" */
+    const char* config_key;  /* e.g. "64_32_32_3_3_2_2_1_1"; presumably channels_h_w_kh_kw_sh_sw_ph_pw -- confirm with the generator */
+    /* Full source/destination tensor extents. */
+    int src_width;  int src_height;  int channels;
+    int dst_width;  int dst_height;
+    /* In the generated entry the row pitch equals the width and the plane pitch equals width * height. */
+    int src_row_pitch;  int src_plane_pitch;
+    int dst_row_pitch;  int dst_plane_pitch;
+    /* Pooling window, stride and padding; together with channels/src size these are what get_maxpool_config_by_params() matches on. */
+    int kernel_h;  int kernel_w;
+    int stride_h;  int stride_w;
+    int pad_h;     int pad_w;
+    /* Per-tile geometry. In the generated entry: in_tile_w = src_width + 2*pad_w, *_tile_plane = tile_w * tile_rows, in_data_offset = pad_h*in_tile_w + pad_w. */
+    int in_tile_w;      int in_tile_rows;   int in_tile_plane;
+    int in_data_offset;
+    int out_tile_w;     int out_tile_rows;  int out_tile_plane;
+    /* Channel/height tiling. In the generated entry: (c_tiles-1)*c_tile_size + c_tile_size_last = channels and height_tiles*output_rows = dst_height. */
+    int c_tile_size;  int c_tiles;  int c_tile_size_last;
+    int height_tiles; int output_rows; int input_rows;
+    /* Staging buffer sizes; units are not stated here (presumably bytes -- TODO confirm). */
+    int input_buffer_size;  int output_buffer_size;
+    /* 0/1 selectors for the ping/pong buffers; presumably the local DRAM bank index -- TODO confirm. */
+    int input_ping_dram;  int input_pong_dram;
+    int output_ping_dram; int output_pong_dram;
+} maxpool_layer_config_t;
+
+#define NUM_MAXPOOL_LAYERS 1  /* loop/index bound for MAXPOOL_LAYER_CONFIGS; the array itself is declared unsized, so keep the two in step */
+/* Generated max-pool table. Trailing notes record arithmetic relations that hold for these values; the generator's own formulas are not shown here. */
+static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = {
+    {
+        .layer_id = 0,
+        .layer_name = "maxpool_3x3s2_c64_32x32",
+        .config_key = "64_32_32_3_3_2_2_1_1",
+        .src_width = 32,
+        .src_height = 32,
+        .channels = 64,
+        .dst_width = 16,
+        .dst_height = 16,
+        .src_row_pitch = 32,
+        .src_plane_pitch = 1024,  /* = src_row_pitch * src_height */
+        .dst_row_pitch = 16,
+        .dst_plane_pitch = 256,  /* = dst_row_pitch * dst_height */
+        .kernel_h = 3,
+        .kernel_w = 3,
+        .stride_h = 2,
+        .stride_w = 2,
+        .pad_h = 1,
+        .pad_w = 1,
+        .in_tile_w = 34,  /* = src_width + 2*pad_w */
+        .in_tile_rows = 5,  /* = input_rows + 2*pad_h */
+        .in_tile_plane = 170,  /* = in_tile_w * in_tile_rows */
+        .in_data_offset = 35,  /* = pad_h*in_tile_w + pad_w */
+        .out_tile_w = 16,
+        .out_tile_rows = 1,
+        .out_tile_plane = 16,  /* = out_tile_w * out_tile_rows */
+        .c_tile_size = 44,
+        .c_tiles = 2,
+        .c_tile_size_last = 20,  /* = channels - (c_tiles-1)*c_tile_size */
+        .height_tiles = 16,  /* height_tiles * output_rows = dst_height */
+        .output_rows = 1,
+        .input_rows = 3,
+        .input_buffer_size = 29920,  /* = in_tile_plane * c_tile_size * 4 */
+        .output_buffer_size = 2816,  /* = out_tile_plane * c_tile_size * 4 */
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 0,
+    },
+};
+
+static inline int get_num_maxpool_layers(void) { const int count = NUM_MAXPOOL_LAYERS; return count; }  /* NUM_MAXPOOL_LAYERS: the bound the max-pool lookup helpers iterate to */
+
+static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) {  /* layer_id doubles as the index into MAXPOOL_LAYER_CONFIGS */
+    if (layer_id >= 0 && layer_id < NUM_MAXPOOL_LAYERS) return MAXPOOL_LAYER_CONFIGS + layer_id;
+    return NULL;  /* out-of-range id */
+}
+
+static inline const maxpool_layer_config_t* get_maxpool_config_by_params(
+    int channels, int src_height, int src_width,
+    int kernel_h, int kernel_w,
+    int stride_h, int stride_w,
+    int pad_h, int pad_w)
+{   /* Linear scan of MAXPOOL_LAYER_CONFIGS; returns the first entry that no check below rejects, else NULL. */
+    for (int idx = 0; idx < NUM_MAXPOOL_LAYERS; ++idx) {
+        const maxpool_layer_config_t* entry = MAXPOOL_LAYER_CONFIGS + idx;
+        if (entry->channels != channels) continue;
+        if (entry->src_height != src_height) continue;
+        if (entry->src_width != src_width) continue;
+        if (entry->kernel_h != kernel_h) continue;
+        if (entry->kernel_w != kernel_w) continue;
+        if (entry->stride_h != stride_h) continue;
+        if (entry->stride_w != stride_w) continue;
+        if (entry->pad_h != pad_h) continue;
+        if (entry->pad_w != pad_w) continue;
+        return entry;
+    }
+    return NULL;
+}
+
+#endif /* LAYER_CONFIGS_H */
diff --git a/backends/cadence/vision/config_generator/layer_configs_4k.h b/backends/cadence/vision/config_generator/layer_configs_4k.h
new file mode 100644
index 00000000000..481adbd63b9
--- /dev/null
+++ b/backends/cadence/vision/config_generator/layer_configs_4k.h
@@ -0,0 +1,2403 @@
+/*
+ * layer_configs.h
+ *
+ * Auto-generated conv2d + maxpool layer configurations
+ * Generated from PTE extraction by generate_combined_configs.py
+ *
+ * DO NOT EDIT MANUALLY
+ */
+
+#ifndef LAYER_CONFIGS_H
+#define LAYER_CONFIGS_H
+
+#include <stdint.h>
+#include <stddef.h>  /* for NULL */
+
+#define IDMA_BUFFER_SIZE_DRAM0 (4096)  /* 4 KB */
+#define IDMA_BUFFER_SIZE_DRAM1 (4096)  /* 4 KB */
+
+/* ====================================================================== */
+/*  Conv2d configurations                                              */
+/* ====================================================================== */
+
+typedef struct {  /* One generated conv2d layer record: tensor geometry, tiling, buffer sizes and shift/scale/zero-point parameters. */
+    int layer_id;  /* equals the entry's position in CONV_LAYER_CONFIGS in the entries shown */
+    const char* layer_name;  /* readable tag, e.g. "conv_7x7_s2_ic3_oc64" */
+    const char* kernel_name;  /* kernel variant tag, e.g. "7x7j2d1_dma" or "3x3j1d1_no_dma" */
+    const char* config_key;  /* "ic_ih_iw_oc_kh_kw_oh_ow_sy_sx_pad_dil" joined with '_', e.g. "3_64_64_64_7_7_32_32_2_2_3_1" */
+    /* Full source tensor: dim1 = width, dim2 = height, dim3 = input channels. In the entries shown dim1_pitch = width and dim2_pitch = width * height. */
+    int src_dim1_size;  int src_dim2_size;  int src_dim3_size;
+    int src_dim1_pitch; int src_dim2_pitch;
+    /* Full destination tensor, same axis convention; dim3 = output channels. */
+    int dst_dim1_size;  int dst_dim2_size;  int dst_dim3_size;
+    int dst_dim1_pitch; int dst_dim2_pitch;
+    /* Input tile geometry. In the entries shown in_dim1_pitch = in_dim1_size + in_dim1_edge1 + in_dim1_edge2; for the "_dma" kernels in_data_offset = in_dim2_edge1*in_dim1_pitch + in_dim1_edge1. */
+    int in_dim1_size;   int in_dim1_pitch;
+    int in_dim2_size;   int in_dim2_pitch;
+    int in_dim1_edge1;  int in_dim1_edge2;
+    int in_dim2_edge1;  int in_dim2_edge2;
+    int in_dim3_edge1;  int in_dim3_edge2;
+    int in_data_offset; int in_rows_firstdma;  /* in_rows_firstdma: presumably rows moved by the first DMA transfer -- TODO confirm */
+    /* Output tile geometry; out_dim3_size equals n_tile_size in the entries shown. */
+    int out_dim1_size;  int out_dim1_pitch;
+    int out_dim2_size;  int out_dim2_pitch;
+    int out_dim3_size;
+    /* Weights: dim1 = kernel width, dim2 = kernel height, dim3 = input channels, dim4 = output channels; pitches are kw, kw*kh, kw*kh*ic in the entries shown. */
+    int coeff_dim1_size;  int coeff_dim2_size;
+    int coeff_dim3_size;  int coeff_dim4_size;
+    int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch;
+    /* Bias / output-scale vectors; dim1 equals the output channel count and dim2 is 1 in the entries shown. */
+    int bias_dim1_size;     int bias_dim2_size;
+    int outscale_dim1_size; int outscale_dim2_size;
+    /* Buffer sizes, presumably bytes (bias is 4x and outscale 2x the channel count in the entries shown) -- TODO confirm units. */
+    int input_buffer_size;  int coeff_buffer_size;  int output_buffer_size;
+    int bias_buffer_size;   int outscale_buffer_size;
+    /* 0/1 selectors per buffer; presumably the local DRAM bank (cf. IDMA_BUFFER_SIZE_DRAM0/1) -- TODO confirm. */
+    int input_ping_dram;  int input_pong_dram;  int coeff_dram;
+    int output_ping_dram; int output_pong_dram;
+    int bias_dram;        int outscale_dram;
+    /* Tiling: n_tiles * n_tile_size = dst_dim3_size and height_tiles * output_rows = dst_dim2_size in the entries shown. */
+    int n_tile_size; int n_tiles; int n_tile_size_last;
+    int height_tiles; int output_rows; int input_rows;
+    /* Convolution and post-processing parameters; kernel_w/kernel_h repeat coeff_dim1_size/coeff_dim2_size in the entries shown. */
+    int kernel_w; int kernel_h;
+    int stride_x; int stride_y;
+    int padding;  int dilation;
+    int accum_shift; int relu_max; int relu_min;
+    int output_shift; int output_scale; int flags;
+    int input_zero_point;
+} conv_layer_config_t;
+
+#define NUM_CONV_LAYERS 29
+
+static const conv_layer_config_t CONV_LAYER_CONFIGS[] = {
+    {
+        .layer_id = 0,
+        .layer_name = "conv_7x7_s2_ic3_oc64",
+        .kernel_name = "7x7j2d1_dma",
+        .config_key = "3_64_64_64_7_7_32_32_2_2_3_1",
+        .src_dim1_size = 64,
+        .src_dim2_size = 64,
+        .src_dim3_size = 3,
+        .src_dim1_pitch = 64,
+        .src_dim2_pitch = 4096,
+        .dst_dim1_size = 32,
+        .dst_dim2_size = 32,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 32,
+        .dst_dim2_pitch = 1024,
+        .in_dim1_size = 64,
+        .in_dim1_pitch = 70,
+        .in_dim2_size = 9,
+        .in_dim2_pitch = 630,
+        .in_dim1_edge1 = 3,
+        .in_dim1_edge2 = 3,
+        .in_dim2_edge1 = 3,
+        .in_dim2_edge2 = 3,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 213,
+        .in_rows_firstdma = 6,
+        .out_dim1_size = 32,
+        .out_dim1_pitch = 32,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 7,
+        .coeff_dim2_size = 7,
+        .coeff_dim3_size = 3,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 7,
+        .coeff_dim2_pitch = 49,
+        .coeff_dim3_pitch = 147,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 1890,
+        .coeff_buffer_size = 1176,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 8,
+        .n_tile_size_last = 8,
+        .height_tiles = 16,
+        .output_rows = 2,
+        .input_rows = 9,
+        .kernel_w = 7,
+        .kernel_h = 7,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 3,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 1,
+        .layer_name = "conv_3x3_s1_ic64_oc64",
+        .kernel_name = "3x3j1d1_no_dma",
+        .config_key = "64_16_16_64_3_3_16_16_1_1_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 324,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 16,
+        .out_dim2_pitch = 256,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 20736,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 16,
+        .input_rows = 16,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 2,
+        .layer_name = "conv_1x1_s2_ic64_oc128",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "64_16_16_128_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 48,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 512,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 16,
+        .n_tile_size_last = 8,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 3,
+        .layer_name = "conv_3x3_s2_ic64_oc128",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "64_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 324,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 20736,
+        .coeff_buffer_size = 73728,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 16,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 4,
+        .layer_name = "conv_3x3_s1_ic128_oc128",
+        .kernel_name = "3x3j1d1_no_dma",
+        .config_key = "128_8_8_128_3_3_8_8_1_1_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 8,
+        .in_dim2_pitch = 100,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 8,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12800,
+        .coeff_buffer_size = 147456,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 8,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 5,
+        .layer_name = "conv_1x1_s2_ic128_oc256",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "128_8_8_256_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 8,
+        .in_dim2_pitch = 64,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 8,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 8,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 6,
+        .layer_name = "conv_3x3_s2_ic128_oc256",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "128_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 8,
+        .in_dim2_pitch = 100,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 8,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12800,
+        .coeff_buffer_size = 294912,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 8,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 7,
+        .layer_name = "conv_3x3_s1_ic256_oc256",
+        .kernel_name = "3x3j1d1_no_dma",
+        .config_key = "256_4_4_256_3_3_4_4_1_1_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 36,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 9216,
+        .coeff_buffer_size = 589824,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 8,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "256_4_4_512_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 4096,
+        .coeff_buffer_size = 131072,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 9,
+        .layer_name = "conv_3x3_s2_ic256_oc512",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "256_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 36,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 9216,
+        .coeff_buffer_size = 1179648,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 10,
+        .layer_name = "conv_3x3_s1_ic512_oc512",
+        .kernel_name = "3x3j1d1_no_dma",
+        .config_key = "512_2_2_512_3_3_2_2_1_1_1_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 2359296,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 11,
+        .layer_name = "conv_1x1_s1_ic64_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_256_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 32,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 1024,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 16,
+        .n_tile_size_last = 16,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 12,
+        .layer_name = "conv_1x1_s1_ic64_oc64",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 32,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 1024,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 4,
+        .n_tile_size_last = 16,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 13,
+        .layer_name = "conv_1x1_s1_ic256_oc64",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "256_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 256,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 16,
+        .out_dim2_pitch = 256,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 65536,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 16,
+        .input_rows = 16,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 14,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "256_16_16_512_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 256,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 65536,
+        .coeff_buffer_size = 131072,
+        .output_buffer_size = 32768,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 16,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 15,
+        .layer_name = "conv_1x1_s1_ic256_oc128",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "256_16_16_128_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 256,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 16,
+        .out_dim2_pitch = 256,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 65536,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 32768,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 16,
+        .input_rows = 16,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 16,
+        .layer_name = "conv_3x3_s2_ic128_oc128",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "128_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 324,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 41472,
+        .coeff_buffer_size = 147456,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 16,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 17,
+        .layer_name = "conv_1x1_s1_ic128_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "128_8_8_512_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 4,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 512,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 4,
+        .n_tiles = 128,
+        .n_tile_size_last = 4,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 18,
+        .layer_name = "conv_1x1_s1_ic512_oc128",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "512_8_8_128_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 8,
+        .in_dim2_pitch = 64,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 8,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 32768,
+        .coeff_buffer_size = 65536,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 8,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 19,
+        .layer_name = "conv_1x1_s2_ic512_oc1024",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 8,
+        .in_dim2_pitch = 64,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 8,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 1024,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 32768,
+        .coeff_buffer_size = 524288,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 1024,
+        .n_tiles = 1,
+        .n_tile_size_last = 1024,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 8,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 20,
+        .layer_name = "conv_1x1_s1_ic512_oc256",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "512_8_8_256_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 8,
+        .in_dim2_pitch = 64,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 8,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 32768,
+        .coeff_buffer_size = 131072,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 8,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 21,
+        .layer_name = "conv_3x3_s2_ic256_oc256",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "256_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 8,
+        .in_dim2_pitch = 100,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 8,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 25600,
+        .coeff_buffer_size = 589824,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 8,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 22,
+        .layer_name = "conv_1x1_s1_ic256_oc1024",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 1024,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 4096,
+        .coeff_buffer_size = 262144,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 1024,
+        .n_tiles = 1,
+        .n_tile_size_last = 1024,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 4,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 23,
+        .layer_name = "conv_1x1_s1_ic1024_oc256",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 16384,
+        .coeff_buffer_size = 262144,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 4,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 24,
+        .layer_name = "conv_1x1_s2_ic1024_oc2048",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 2048,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 16384,
+        .coeff_buffer_size = 2097152,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 2048,
+        .n_tiles = 1,
+        .n_tile_size_last = 2048,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 25,
+        .layer_name = "conv_1x1_s1_ic1024_oc512",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 16384,
+        .coeff_buffer_size = 524288,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 4,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 26,
+        .layer_name = "conv_3x3_s2_ic512_oc512",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "512_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 36,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 18432,
+        .coeff_buffer_size = 2359296,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 27,
+        .layer_name = "conv_1x1_s1_ic512_oc2048",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 2048,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 1048576,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 2048,
+        .n_tiles = 1,
+        .n_tile_size_last = 2048,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 28,
+        .layer_name = "conv_1x1_s1_ic2048_oc512",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 2048,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 2048,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 2048,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 1048576,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+};
+
+static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; }  /* NUM_CONV_LAYERS: the index bound used for CONV_LAYER_CONFIGS */
+
+/* Return &CONV_LAYER_CONFIGS[layer_id], or NULL when layer_id is outside [0, NUM_CONV_LAYERS). */
+static inline const conv_layer_config_t* get_conv_config(int layer_id) {
+    return (layer_id >= 0 && layer_id < NUM_CONV_LAYERS) ? &CONV_LAYER_CONFIGS[layer_id] : NULL;
+}
+
+/* Return the first conv config whose src dims (ic, ih, iw), dst dims
+ * (oc, oh, ow), coeff dims (kh, kw), stride (sy, sx), padding and dilation
+ * all equal the arguments, or NULL when no entry matches. */
+static inline const conv_layer_config_t* get_layer_config_by_params(
+    int ic, int ih, int iw, int oc, int kh, int kw,
+    int oh, int ow, int sy, int sx, int pad, int dil)
+{
+    const conv_layer_config_t* entry = CONV_LAYER_CONFIGS;
+    for (int left = NUM_CONV_LAYERS; left > 0; left--, entry++) {
+        if (entry->src_dim3_size != ic || entry->src_dim2_size != ih ||
+            entry->src_dim1_size != iw)
+            continue;
+        if (entry->dst_dim3_size != oc || entry->dst_dim2_size != oh ||
+            entry->dst_dim1_size != ow)
+            continue;
+        if (entry->coeff_dim2_size != kh || entry->coeff_dim1_size != kw)
+            continue;
+        if (entry->stride_y != sy || entry->stride_x != sx)
+            continue;
+        if (entry->padding != pad || entry->dilation != dil)
+            continue;
+        return entry;
+    }
+    return NULL;
+}
+
+/* Return the first conv config whose config_key equals config_key exactly, else NULL. */
+static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) {
+    if (!config_key) return NULL;
+    for (int idx = 0; idx < NUM_CONV_LAYERS; idx++) {
+        const char* want = config_key;
+        const char* have = CONV_LAYER_CONFIGS[idx].config_key;
+        if (!have) continue;  /* entries without a key never match */
+        /* Advance while both strings agree and the query has not ended. */
+        for (; *want != '\0' && *want == *have; want++, have++) {}
+        if (*want == *have) return &CONV_LAYER_CONFIGS[idx];
+    }
+    return NULL;
+}
+
+/* ====================================================================== */
+/*  MaxPool configurations                                             */
+/* ====================================================================== */
+
+typedef struct {  /* one precomputed maxpool layer entry of MAXPOOL_LAYER_CONFIGS */
+    int layer_id;            /* entry identifier; note get_maxpool_config() indexes the table by position */
+    const char* layer_name;  /* descriptive label string */
+    const char* config_key;  /* underscore-separated parameter string */
+
+    int src_width;  int src_height;  int channels;  /* all three are compared by get_maxpool_config_by_params() */
+    int dst_width;  int dst_height;
+
+    int src_row_pitch;  int src_plane_pitch;
+    int dst_row_pitch;  int dst_plane_pitch;
+
+    int kernel_h;  int kernel_w;  /* kernel, stride and pad are also compared by get_maxpool_config_by_params() */
+    int stride_h;  int stride_w;
+    int pad_h;     int pad_w;
+
+    int in_tile_w;      int in_tile_rows;   int in_tile_plane;
+    int in_data_offset;
+    int out_tile_w;     int out_tile_rows;  int out_tile_plane;
+
+    int c_tile_size;  int c_tiles;  int c_tile_size_last;  /* NOTE(review): presumably channel tiling - confirm */
+    int height_tiles; int output_rows; int input_rows;
+
+    int input_buffer_size;  int output_buffer_size;  /* NOTE(review): units are not stated here - confirm */
+
+    int input_ping_dram;  int input_pong_dram;  /* NOTE(review): presumably a DRAM bank selector (0 or 1) - confirm */
+    int output_ping_dram; int output_pong_dram;
+} maxpool_layer_config_t;
+
+#define NUM_MAXPOOL_LAYERS 1  /* loop/index bound for MAXPOOL_LAYER_CONFIGS; must equal its entry count */
+
+static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = {
+    {  /* comments below note arithmetic relations that hold for this entry's values */
+        .layer_id = 0,
+        .layer_name = "maxpool_3x3s2_c64_32x32",
+        .config_key = "64_32_32_3_3_2_2_1_1",
+        .src_width = 32,
+        .src_height = 32,
+        .channels = 64,
+        .dst_width = 16,
+        .dst_height = 16,
+        .src_row_pitch = 32,
+        .src_plane_pitch = 1024,  /* equals src_row_pitch * src_height */
+        .dst_row_pitch = 16,
+        .dst_plane_pitch = 256,  /* equals dst_row_pitch * dst_height */
+        .kernel_h = 3,
+        .kernel_w = 3,
+        .stride_h = 2,
+        .stride_w = 2,
+        .pad_h = 1,
+        .pad_w = 1,
+        .in_tile_w = 34,  /* equals src_width + 2 * pad_w */
+        .in_tile_rows = 5,
+        .in_tile_plane = 170,  /* equals in_tile_w * in_tile_rows */
+        .in_data_offset = 35,  /* equals pad_h * in_tile_w + pad_w */
+        .out_tile_w = 16,
+        .out_tile_rows = 1,
+        .out_tile_plane = 16,  /* equals out_tile_w * out_tile_rows */
+        .c_tile_size = 5,
+        .c_tiles = 13,
+        .c_tile_size_last = 4,  /* 12 * c_tile_size + 4 == channels (64) */
+        .height_tiles = 16,  /* equals dst_height / output_rows */
+        .output_rows = 1,
+        .input_rows = 3,
+        .input_buffer_size = 3400,  /* equals in_tile_plane * c_tile_size * 4 */
+        .output_buffer_size = 320,  /* equals out_tile_plane * c_tile_size * 4 */
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 0,
+    },
+};
+
+static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; }  /* NUM_MAXPOOL_LAYERS: the index bound used for MAXPOOL_LAYER_CONFIGS */
+
+/* Return &MAXPOOL_LAYER_CONFIGS[layer_id], or NULL when layer_id is outside [0, NUM_MAXPOOL_LAYERS). */
+static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) {
+    return (layer_id >= 0 && layer_id < NUM_MAXPOOL_LAYERS) ? &MAXPOOL_LAYER_CONFIGS[layer_id] : NULL;
+}
+
+/* Return the first maxpool config whose channels, source size, kernel,
+ * stride and padding all equal the arguments, or NULL when none matches. */
+static inline const maxpool_layer_config_t* get_maxpool_config_by_params(
+    int channels, int src_height, int src_width, int kernel_h, int kernel_w,
+    int stride_h, int stride_w, int pad_h, int pad_w)
+{
+    const maxpool_layer_config_t* entry = MAXPOOL_LAYER_CONFIGS;
+    for (int left = NUM_MAXPOOL_LAYERS; left > 0; left--, entry++) {
+        if (entry->channels != channels || entry->src_height != src_height ||
+            entry->src_width != src_width)
+            continue;
+        if (entry->kernel_h != kernel_h || entry->kernel_w != kernel_w)
+            continue;
+        if (entry->stride_h != stride_h || entry->stride_w != stride_w)
+            continue;
+        if (entry->pad_h != pad_h || entry->pad_w != pad_w)
+            continue;
+        return entry;
+    }
+    return NULL;
+}
+
+#endif /* LAYER_CONFIGS_H */
diff --git a/backends/cadence/vision/config_generator/layer_configs_61k.h b/backends/cadence/vision/config_generator/layer_configs_61k.h
new file mode 100644
index 00000000000..3f47d4533eb
--- /dev/null
+++ b/backends/cadence/vision/config_generator/layer_configs_61k.h
@@ -0,0 +1,2403 @@
+/*
+ * layer_configs.h
+ *
+ * Auto-generated conv2d + maxpool layer configurations
+ * Generated from PTE extraction by generate_combined_configs.py
+ *
+ * DO NOT EDIT MANUALLY
+ */
+
+#ifndef LAYER_CONFIGS_H
+#define LAYER_CONFIGS_H
+
+#include <stdint.h>
+#include <stddef.h>  /* for NULL */
+
+#define IDMA_BUFFER_SIZE_DRAM0 (62976)  /* 61.5 KB */
+#define IDMA_BUFFER_SIZE_DRAM1 (62976)  /* 61.5 KB */
+
+/* ====================================================================== */
+/*  Conv2d configurations                                              */
+/* ====================================================================== */
+
+typedef struct {  /* one generated conv2d layer configuration (shapes, tile/buffer sizes, kernel parameters) */
+    int layer_id;  /* runs 0, 1, 2, ... in table order in the generated entries */
+    const char* layer_name;  /* label such as "conv_7x7_s2_ic3_oc64" */
+    const char* kernel_name;  /* kernel tag such as "7x7j2d1_dma" */
+    const char* config_key;  /* underscore-joined parameter key such as "3_64_64_64_7_7_32_32_2_2_3_1" */
+
+    int src_dim1_size;  int src_dim2_size;  int src_dim3_size;  /* source extents; dim3 tracks the "ic" count in layer_name */
+    int src_dim1_pitch; int src_dim2_pitch;  /* generated as dim1_size and dim1_size * dim2_size */
+
+    int dst_dim1_size;  int dst_dim2_size;  int dst_dim3_size;  /* destination extents; dim3 tracks the "oc" count in layer_name */
+    int dst_dim1_pitch; int dst_dim2_pitch;  /* generated as dim1_size and dim1_size * dim2_size */
+
+    int in_dim1_size;   int in_dim1_pitch;  /* pitch = size + in_dim1_edge1 + in_dim1_edge2 in the generated entries */
+    int in_dim2_size;   int in_dim2_pitch;  /* size equals input_rows; pitch = in_dim1_pitch * in_dim2_size */
+    int in_dim1_edge1;  int in_dim1_edge2;  /* equal to `padding` in the generated entries */
+    int in_dim2_edge1;  int in_dim2_edge2;  /* equal to `padding` in the generated entries */
+    int in_dim3_edge1;  int in_dim3_edge2;  /* 0 in the generated entries inspected */
+    int in_data_offset; int in_rows_firstdma;  /* offset = in_dim2_edge1 * in_dim1_pitch + in_dim1_edge1; firstdma = in_dim2_size - in_dim2_edge1 */
+
+    int out_dim1_size;  int out_dim1_pitch;
+    int out_dim2_size;  int out_dim2_pitch;  /* size equals output_rows; pitch = out_dim1_pitch * out_dim2_size */
+    int out_dim3_size;  /* equals n_tile_size in the generated entries */
+
+    int coeff_dim1_size;  int coeff_dim2_size;  /* match kernel_w / kernel_h in the generated entries */
+    int coeff_dim3_size;  int coeff_dim4_size;  /* match src_dim3_size / dst_dim3_size in the generated entries */
+    int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch;  /* running products of coeff dim1..dim3 sizes */
+
+    int bias_dim1_size;     int bias_dim2_size;  /* dim1 = dst_dim3_size, dim2 = 1 in the generated entries */
+    int outscale_dim1_size; int outscale_dim2_size;  /* same values as the bias dims in the generated entries */
+
+    int input_buffer_size;  int coeff_buffer_size;  int output_buffer_size;  /* presumably bytes per local buffer -- TODO confirm against the kernels */
+    int bias_buffer_size;   int outscale_buffer_size;  /* 4x bias_dim1_size and 2x outscale_dim1_size in the generated entries */
+
+    int input_ping_dram;  int input_pong_dram;  int coeff_dram;  /* 0 or 1; presumably selects DRAM0 vs DRAM1 -- TODO confirm */
+    int output_ping_dram; int output_pong_dram;
+    int bias_dram;        int outscale_dram;
+
+    int n_tile_size; int n_tiles; int n_tile_size_last;  /* n_tile_size * n_tiles = dst_dim3_size in the generated entries */
+    int height_tiles; int output_rows; int input_rows;  /* height_tiles * output_rows >= dst_dim2_size in the generated entries */
+
+    int kernel_w; int kernel_h;
+    int stride_x; int stride_y;
+    int padding;  int dilation;
+    int accum_shift; int relu_max; int relu_min;  /* presumably accumulator shift and activation clamp bounds -- meaning is set by the kernels, verify there */
+    int output_shift; int output_scale; int flags;
+    int input_zero_point;
+} conv_layer_config_t;
+
+#define NUM_CONV_LAYERS 29
+
+static const conv_layer_config_t CONV_LAYER_CONFIGS[] = {
+    {
+        .layer_id = 0,
+        .layer_name = "conv_7x7_s2_ic3_oc64",
+        .kernel_name = "7x7j2d1_dma",
+        .config_key = "3_64_64_64_7_7_32_32_2_2_3_1",
+        .src_dim1_size = 64,
+        .src_dim2_size = 64,
+        .src_dim3_size = 3,
+        .src_dim1_pitch = 64,
+        .src_dim2_pitch = 4096,
+        .dst_dim1_size = 32,
+        .dst_dim2_size = 32,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 32,
+        .dst_dim2_pitch = 1024,
+        .in_dim1_size = 64,
+        .in_dim1_pitch = 70,
+        .in_dim2_size = 35,
+        .in_dim2_pitch = 2450,
+        .in_dim1_edge1 = 3,
+        .in_dim1_edge2 = 3,
+        .in_dim2_edge1 = 3,
+        .in_dim2_edge2 = 3,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 213,
+        .in_rows_firstdma = 32,
+        .out_dim1_size = 32,
+        .out_dim1_pitch = 32,
+        .out_dim2_size = 15,
+        .out_dim2_pitch = 480,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 7,
+        .coeff_dim2_size = 7,
+        .coeff_dim3_size = 3,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 7,
+        .coeff_dim2_pitch = 49,
+        .coeff_dim3_pitch = 147,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7350,
+        .coeff_buffer_size = 9408,
+        .output_buffer_size = 30720,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 3,
+        .output_rows = 15,
+        .input_rows = 35,
+        .kernel_w = 7,
+        .kernel_h = 7,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 3,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 1,
+        .layer_name = "conv_3x3_s1_ic64_oc64",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "64_16_16_64_3_3_16_16_1_1_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 18,
+        .in_dim2_pitch = 324,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 17,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 16,
+        .out_dim2_pitch = 256,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 20736,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 16,
+        .input_rows = 18,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 2,
+        .layer_name = "conv_1x1_s2_ic64_oc128",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "64_16_16_128_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 15,
+        .in_dim2_pitch = 240,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 15,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 15360,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 15,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 3,
+        .layer_name = "conv_3x3_s2_ic64_oc128",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "64_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5760,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 2,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 4,
+        .layer_name = "conv_3x3_s1_ic128_oc128",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "128_8_8_128_3_3_8_8_1_1_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 40,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5120,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 4,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 5,
+        .layer_name = "conv_1x1_s2_ic128_oc256",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "128_8_8_256_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 7,
+        .in_dim2_pitch = 56,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 7,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7168,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 7,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 6,
+        .layer_name = "conv_3x3_s2_ic128_oc256",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "128_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6400,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 8,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 7,
+        .layer_name = "conv_3x3_s1_ic256_oc256",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "256_4_4_256_3_3_4_4_1_1_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6144,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 16,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 8,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "256_4_4_512_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 12,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 4,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 9,
+        .layer_name = "conv_3x3_s2_ic256_oc512",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "256_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 30,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7680,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 32,
+        .n_tile_size_last = 16,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 10,
+        .layer_name = "conv_3x3_s1_ic512_oc512",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "512_2_2_512_3_3_2_2_1_1_1_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 5,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 64,
+        .n_tile_size_last = 8,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 11,
+        .layer_name = "conv_1x1_s1_ic64_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_256_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 7,
+        .in_dim2_pitch = 112,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 7,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 7,
+        .out_dim2_pitch = 112,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7168,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 28672,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 3,
+        .output_rows = 7,
+        .input_rows = 7,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 12,
+        .layer_name = "conv_1x1_s1_ic64_oc64",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 256,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 16,
+        .out_dim2_pitch = 256,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 16384,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 16,
+        .input_rows = 16,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 13,
+        .layer_name = "conv_1x1_s1_ic256_oc64",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 10,
+        .in_dim2_pitch = 160,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 10,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 10,
+        .out_dim2_pitch = 160,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 40960,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 10240,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 2,
+        .output_rows = 10,
+        .input_rows = 10,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 14,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "256_16_16_512_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 48,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 4,
+        .n_tile_size_last = 128,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 15,
+        .layer_name = "conv_1x1_s1_ic256_oc128",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_16_16_128_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 7,
+        .in_dim2_pitch = 112,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 7,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 7,
+        .out_dim2_pitch = 112,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 28672,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 14336,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 3,
+        .output_rows = 7,
+        .input_rows = 7,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 16,
+        .layer_name = "conv_3x3_s2_ic128_oc128",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "128_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 11520,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 4,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 17,
+        .layer_name = "conv_1x1_s1_ic128_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "128_8_8_512_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 256,
+        .n_tiles = 2,
+        .n_tile_size_last = 256,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 18,
+        .layer_name = "conv_1x1_s1_ic512_oc128",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_8_8_128_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 2,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 19,
+        .layer_name = "conv_1x1_s2_ic512_oc1024",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 16,
+        .n_tile_size_last = 64,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 20,
+        .layer_name = "conv_1x1_s1_ic512_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_8_8_256_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 4,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 21,
+        .layer_name = "conv_3x3_s2_ic256_oc256",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "256_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12800,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 16,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 22,
+        .layer_name = "conv_1x1_s1_ic256_oc1024",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 8,
+        .n_tile_size_last = 128,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 23,
+        .layer_name = "conv_1x1_s1_ic1024_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 8,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 24,
+        .layer_name = "conv_1x1_s2_ic1024_oc2048",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 12,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 64,
+        .n_tile_size_last = 32,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 25,
+        .layer_name = "conv_1x1_s1_ic1024_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 16,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 26,
+        .layer_name = "conv_3x3_s2_ic512_oc512",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "512_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 30,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 15360,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 64,
+        .n_tile_size_last = 8,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 27,
+        .layer_name = "conv_1x1_s1_ic512_oc2048",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 32,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 28,
+        .layer_name = "conv_1x1_s1_ic2048_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 2048,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 2048,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 2048,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 32,
+        .n_tile_size_last = 16,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+};
+
+static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; }  /* Returns NUM_CONV_LAYERS; get_conv_config() accepts layer ids in [0, this value). */
+
+/* Conv config stored at table index layer_id; NULL when layer_id is outside [0, NUM_CONV_LAYERS). */
+static inline const conv_layer_config_t* get_conv_config(int layer_id) {
+    return (layer_id >= 0 && layer_id < NUM_CONV_LAYERS) ? &CONV_LAYER_CONFIGS[layer_id] : NULL;
+}
+
+/* Find the first CONV_LAYER_CONFIGS entry whose src/dst sizes, kernel size, strides,
+ * padding and dilation all equal the arguments; returns NULL when none matches. */
+static inline const conv_layer_config_t* get_layer_config_by_params(
+    int ic, int ih, int iw,
+    int oc, int kh, int kw,
+    int oh, int ow,
+    int sy, int sx,
+    int pad, int dil)
+{
+    const conv_layer_config_t* entry = CONV_LAYER_CONFIGS;
+    const conv_layer_config_t* const table_end = CONV_LAYER_CONFIGS + NUM_CONV_LAYERS;
+    for (; entry != table_end; ++entry) {
+        /* Reject on the first mismatching group; field order mirrors the parameter list. */
+        if (entry->src_dim3_size != ic || entry->src_dim2_size != ih || entry->src_dim1_size != iw)
+            continue;
+        if (entry->dst_dim3_size != oc || entry->coeff_dim2_size != kh || entry->coeff_dim1_size != kw)
+            continue;
+        if (entry->dst_dim2_size != oh || entry->dst_dim1_size != ow)
+            continue;
+        if (entry->stride_y != sy || entry->stride_x != sx || entry->padding != pad || entry->dilation != dil)
+            continue;
+        return entry;
+    }
+    return NULL;
+}
+
+/* Look up a conv config by exact, case-sensitive match of its config_key string.
+ * Returns NULL when config_key is NULL or no entry carries that key. */
+static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) {
+    if (!config_key) return NULL;
+    for (int idx = 0; idx < NUM_CONV_LAYERS; ++idx) {
+        const char* want = config_key;
+        const char* have = CONV_LAYER_CONFIGS[idx].config_key;
+        if (!have) continue;  /* an entry without a key can never match */
+        for (; *want != '\0' && *want == *have; ++want, ++have) { }  /* lockstep scan while both agree */
+        if (*want == '\0' && *have == '\0') return &CONV_LAYER_CONFIGS[idx];
+    }
+    return NULL;
+}
+
+/* ====================================================================== */
+/*  MaxPool configurations                                             */
+/* ====================================================================== */
+
+typedef struct {  /* One max-pool layer: tensor geometry plus the tiling and buffer fields stored with it. */
+    int layer_id;  /* 0 for the single visible entry, which sits at table index 0 */
+    const char* layer_name;  /* descriptive label, e.g. "maxpool_3x3s2_c64_32x32" */
+    const char* config_key;  /* underscore-joined key; the visible entry reads channels_srcH_srcW_kH_kW_sH_sW_padH_padW */
+
+    int src_width;  int src_height;  int channels;  /* matched by get_maxpool_config_by_params() */
+    int dst_width;  int dst_height;  /* output size; not part of the by-params match */
+
+    int src_row_pitch;  int src_plane_pitch;  /* visible entry: src_width and src_width * src_height */
+    int dst_row_pitch;  int dst_plane_pitch;  /* visible entry: dst_width and dst_width * dst_height */
+
+    int kernel_h;  int kernel_w;  /* matched by get_maxpool_config_by_params() */
+    int stride_h;  int stride_w;  /* matched by get_maxpool_config_by_params() */
+    int pad_h;     int pad_w;  /* matched by get_maxpool_config_by_params() */
+
+    int in_tile_w;      int in_tile_rows;   int in_tile_plane;  /* visible entry: in_tile_w == src_width + 2*pad_w, in_tile_plane == in_tile_w * in_tile_rows */
+    int in_data_offset;  /* visible entry: pad_h * in_tile_w + pad_w; presumably the first non-padding element of the tile - TODO confirm */
+    int out_tile_w;     int out_tile_rows;  int out_tile_plane;  /* visible entry: out_tile_plane == out_tile_w * out_tile_rows */
+
+    int c_tile_size;  int c_tiles;  int c_tile_size_last;  /* presumably channel tiling: tile size, tile count, size of the last tile - TODO confirm */
+    int height_tiles; int output_rows; int input_rows;  /* visible entry: height_tiles * output_rows == dst_height */
+
+    int input_buffer_size;  int output_buffer_size;  /* visible entry: 4 * c_tile_size * in_tile_plane and 4 * c_tile_size * out_tile_plane; unit not shown here */
+
+    int input_ping_dram;  int input_pong_dram;  /* 0/1 selectors; presumably the DRAM bank of each input buffer - TODO confirm */
+    int output_ping_dram; int output_pong_dram;  /* 0/1 selectors; presumably the DRAM bank of each output buffer - TODO confirm */
+} maxpool_layer_config_t;
+
+#define NUM_MAXPOOL_LAYERS 1  /* loop/range bound used by the maxpool lookups; the table below has one entry */
+
+static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = {  /* max-pool layer table, read by the get_maxpool_* helpers */
+    {
+        .layer_id = 0,
+        .layer_name = "maxpool_3x3s2_c64_32x32",
+        .config_key = "64_32_32_3_3_2_2_1_1",  /* channels_srcH_srcW_kH_kW_sH_sW_padH_padW for this entry */
+        .src_width = 32,
+        .src_height = 32,
+        .channels = 64,
+        .dst_width = 16,
+        .dst_height = 16,
+        .src_row_pitch = 32,  /* == src_width */
+        .src_plane_pitch = 1024,  /* == src_width * src_height */
+        .dst_row_pitch = 16,  /* == dst_width */
+        .dst_plane_pitch = 256,  /* == dst_width * dst_height */
+        .kernel_h = 3,
+        .kernel_w = 3,
+        .stride_h = 2,
+        .stride_w = 2,
+        .pad_h = 1,
+        .pad_w = 1,
+        .in_tile_w = 34,  /* == src_width + 2 * pad_w */
+        .in_tile_rows = 5,  /* == input_rows + 2 * pad_h */
+        .in_tile_plane = 170,  /* == in_tile_w * in_tile_rows */
+        .in_data_offset = 35,  /* == pad_h * in_tile_w + pad_w */
+        .out_tile_w = 16,  /* == dst_width */
+        .out_tile_rows = 1,  /* == output_rows */
+        .out_tile_plane = 16,  /* == out_tile_w * out_tile_rows */
+        .c_tile_size = 64,  /* == channels, so c_tiles is 1 */
+        .c_tiles = 1,
+        .c_tile_size_last = 64,
+        .height_tiles = 16,  /* height_tiles * output_rows == dst_height */
+        .output_rows = 1,
+        .input_rows = 3,  /* == kernel_h for one output row */
+        .input_buffer_size = 43520,  /* == 4 * c_tile_size * in_tile_plane */
+        .output_buffer_size = 4096,  /* == 4 * c_tile_size * out_tile_plane */
+        .input_ping_dram = 0,  /* ping/pong selectors differ for input ... */
+        .input_pong_dram = 1,
+        .output_ping_dram = 1,  /* ... and for output; presumably DRAM bank ids - TODO confirm */
+        .output_pong_dram = 0,
+    },
+};
+
+static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; }  /* Returns NUM_MAXPOOL_LAYERS; get_maxpool_config() accepts layer ids in [0, this value). */
+
+/* Max-pool config stored at table index layer_id; NULL when layer_id is outside [0, NUM_MAXPOOL_LAYERS). */
+static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) {
+    return (layer_id >= 0 && layer_id < NUM_MAXPOOL_LAYERS) ? &MAXPOOL_LAYER_CONFIGS[layer_id] : NULL;
+}
+
+/* First MAXPOOL_LAYER_CONFIGS entry whose channels, source size, kernel, stride and padding all equal the arguments, else NULL. */
+static inline const maxpool_layer_config_t* get_maxpool_config_by_params(
+    int channels, int src_height, int src_width,
+    int kernel_h, int kernel_w,
+    int stride_h, int stride_w,
+    int pad_h, int pad_w)
+{
+    for (int idx = 0; idx < NUM_MAXPOOL_LAYERS; ++idx) {
+        const maxpool_layer_config_t* entry = MAXPOOL_LAYER_CONFIGS + idx;
+        if (entry->channels != channels || entry->src_height != src_height || entry->src_width != src_width)
+            continue;
+        if (entry->kernel_h != kernel_h || entry->kernel_w != kernel_w)
+            continue;
+        if (entry->stride_h != stride_h || entry->stride_w != stride_w)
+            continue;
+        if (entry->pad_h != pad_h || entry->pad_w != pad_w)
+            continue;
+        return entry;
+    }
+    return NULL;
+}
+
+#endif /* LAYER_CONFIGS_H */
diff --git a/backends/cadence/vision/config_generator/layer_configs_8k.h b/backends/cadence/vision/config_generator/layer_configs_8k.h
new file mode 100644
index 00000000000..de165698252
--- /dev/null
+++ b/backends/cadence/vision/config_generator/layer_configs_8k.h
@@ -0,0 +1,2403 @@
+/*
+ * layer_configs.h
+ *
+ * Auto-generated conv2d + maxpool layer configurations
+ * Generated from PTE extraction by generate_combined_configs.py
+ *
+ * DO NOT EDIT MANUALLY
+ */
+
+#ifndef LAYER_CONFIGS_H
+#define LAYER_CONFIGS_H
+
+#include <stdint.h>
+#include <stddef.h>  /* for NULL */
+
+#define IDMA_BUFFER_SIZE_DRAM0 (8192)  /* 8 KB */
+#define IDMA_BUFFER_SIZE_DRAM1 (8192)  /* 8 KB */
+
+/* ====================================================================== */
+/*  Conv2d configurations                                              */
+/* ====================================================================== */
+
+typedef struct {  /* One conv2d layer: tensor geometry, tile geometry, buffer sizes and quantization constants. */
+    int layer_id;  /* equals the entry's position in CONV_LAYER_CONFIGS for the visible entries */
+    const char* layer_name;  /* descriptive label, e.g. "conv_7x7_s2_ic3_oc64" */
+    const char* kernel_name;  /* e.g. "7x7j2d1_dma"; visible names end in _dma or _no_dma */
+    const char* config_key;  /* underscore-joined key; visible keys read ic_ih_iw_oc_kh_kw_oh_ow_sy_sx_pad_dil */
+
+    int src_dim1_size;  int src_dim2_size;  int src_dim3_size;  /* full input tensor; dim3 equals the icN of layer_name */
+    int src_dim1_pitch; int src_dim2_pitch;  /* visible entries: dim1_size and dim1_size * dim2_size */
+
+    int dst_dim1_size;  int dst_dim2_size;  int dst_dim3_size;  /* full output tensor; dim3 equals the ocN of layer_name */
+    int dst_dim1_pitch; int dst_dim2_pitch;  /* visible entries: dim1_size and dim1_size * dim2_size */
+
+    int in_dim1_size;   int in_dim1_pitch;  /* visible entries: pitch == size + in_dim1_edge1 + in_dim1_edge2 */
+    int in_dim2_size;   int in_dim2_pitch;  /* visible _dma entries: pitch == in_dim1_pitch * in_dim2_size */
+    int in_dim1_edge1;  int in_dim1_edge2;  /* visible entries: both equal padding */
+    int in_dim2_edge1;  int in_dim2_edge2;  /* visible entries: both equal padding */
+    int in_dim3_edge1;  int in_dim3_edge2;  /* 0 in every visible entry */
+    int in_data_offset; int in_rows_firstdma;  /* visible _dma entries: edge1 * in_dim1_pitch + edge1, and in_dim2_size - in_dim2_edge1 */
+
+    int out_dim1_size;  int out_dim1_pitch;  /* visible entries: pitch == size == dst_dim1_size */
+    int out_dim2_size;  int out_dim2_pitch;  /* visible entries: size == output_rows, pitch == out_dim1_size * out_dim2_size */
+    int out_dim3_size;  /* visible entries: == n_tile_size */
+
+    int coeff_dim1_size;  int coeff_dim2_size;  /* visible entries: kernel_w and kernel_h */
+    int coeff_dim3_size;  int coeff_dim4_size;  /* visible entries: src_dim3_size and dst_dim3_size */
+    int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch;  /* visible entries: running products of coeff dim1..dim3 sizes */
+
+    int bias_dim1_size;     int bias_dim2_size;  /* visible entries: dst_dim3_size and 1 */
+    int outscale_dim1_size; int outscale_dim2_size;  /* visible entries: dst_dim3_size and 1 */
+
+    int input_buffer_size;  int coeff_buffer_size;  int output_buffer_size;  /* visible entries: in_dim2_pitch*src_dim3_size, coeff_dim3_pitch*n_tile_size, out_dim2_pitch*n_tile_size */
+    int bias_buffer_size;   int outscale_buffer_size;  /* visible entries: 4 * bias_dim1_size and 2 * outscale_dim1_size; presumably bytes - TODO confirm */
+
+    int input_ping_dram;  int input_pong_dram;  int coeff_dram;  /* 0/1 selectors; presumably DRAM bank ids - TODO confirm */
+    int output_ping_dram; int output_pong_dram;  /* 0/1 selectors, as above */
+    int bias_dram;        int outscale_dram;  /* 0/1 selectors, as above */
+
+    int n_tile_size; int n_tiles; int n_tile_size_last;  /* visible entries: n_tile_size * n_tiles == dst_dim3_size */
+    int height_tiles; int output_rows; int input_rows;  /* visible entries: height_tiles * output_rows == dst_dim2_size, input_rows == in_dim2_size */
+
+    int kernel_w; int kernel_h;  /* visible entries: equal coeff_dim1_size / coeff_dim2_size */
+    int stride_x; int stride_y;  /* equal in every visible entry */
+    int padding;  int dilation;  /* dilation is 1 in every visible entry */
+    int accum_shift; int relu_max; int relu_min;  /* 8, 4000, 0 in every visible entry */
+    int output_shift; int output_scale; int flags;  /* 11, 0, 0 in every visible entry */
+    int input_zero_point;  /* 0 in every visible entry */
+} conv_layer_config_t;
+
+#define NUM_CONV_LAYERS 29
+
+static const conv_layer_config_t CONV_LAYER_CONFIGS[] = {
+    {
+        .layer_id = 0,
+        .layer_name = "conv_7x7_s2_ic3_oc64",
+        .kernel_name = "7x7j2d1_dma",
+        .config_key = "3_64_64_64_7_7_32_32_2_2_3_1",
+        .src_dim1_size = 64,
+        .src_dim2_size = 64,
+        .src_dim3_size = 3,
+        .src_dim1_pitch = 64,
+        .src_dim2_pitch = 4096,
+        .dst_dim1_size = 32,
+        .dst_dim2_size = 32,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 32,
+        .dst_dim2_pitch = 1024,
+        .in_dim1_size = 64,
+        .in_dim1_pitch = 70,
+        .in_dim2_size = 9,
+        .in_dim2_pitch = 630,
+        .in_dim1_edge1 = 3,
+        .in_dim1_edge2 = 3,
+        .in_dim2_edge1 = 3,
+        .in_dim2_edge2 = 3,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 213,
+        .in_rows_firstdma = 6,
+        .out_dim1_size = 32,
+        .out_dim1_pitch = 32,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 7,
+        .coeff_dim2_size = 7,
+        .coeff_dim3_size = 3,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 7,
+        .coeff_dim2_pitch = 49,
+        .coeff_dim3_pitch = 147,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 1890,
+        .coeff_buffer_size = 4704,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 2,
+        .n_tile_size_last = 32,
+        .height_tiles = 16,
+        .output_rows = 2,
+        .input_rows = 9,
+        .kernel_w = 7,
+        .kernel_h = 7,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 3,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 1,
+        .layer_name = "conv_3x3_s1_ic64_oc64",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "64_16_16_64_3_3_16_16_1_1_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 72,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 4,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 4608,
+        .coeff_buffer_size = 2304,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 4,
+        .n_tiles = 16,
+        .n_tile_size_last = 4,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 2,
+        .layer_name = "conv_1x1_s2_ic64_oc128",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "64_16_16_128_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 48,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 2,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 3,
+        .layer_name = "conv_3x3_s2_ic64_oc128",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "64_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 4,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5760,
+        .coeff_buffer_size = 2304,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 4,
+        .n_tiles = 32,
+        .n_tile_size_last = 4,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 4,
+        .layer_name = "conv_3x3_s1_ic128_oc128",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "128_8_8_128_3_3_8_8_1_1_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 40,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 2,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5120,
+        .coeff_buffer_size = 2304,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 2,
+        .n_tiles = 64,
+        .n_tile_size_last = 2,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 5,
+        .layer_name = "conv_1x1_s2_ic128_oc256",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "128_8_8_256_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 8,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 6,
+        .layer_name = "conv_3x3_s2_ic128_oc256",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "128_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 1,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6400,
+        .coeff_buffer_size = 1152,
+        .output_buffer_size = 8,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 1,
+        .n_tiles = 256,
+        .n_tile_size_last = 1,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 7,
+        .layer_name = "conv_3x3_s1_ic256_oc256",
+        .kernel_name = "3x3j1d1_no_dma",
+        .config_key = "256_4_4_256_3_3_4_4_1_1_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 36,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 9216,
+        .coeff_buffer_size = 589824,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 8,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "256_4_4_512_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 12,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 32,
+        .n_tile_size_last = 16,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 9,
+        .layer_name = "conv_3x3_s2_ic256_oc512",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "256_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 36,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 9216,
+        .coeff_buffer_size = 1179648,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 10,
+        .layer_name = "conv_3x3_s1_ic512_oc512",
+        .kernel_name = "3x3j1d1_no_dma",
+        .config_key = "512_2_2_512_3_3_2_2_1_1_1_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 2359296,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 11,
+        .layer_name = "conv_1x1_s1_ic64_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_256_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 32,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 4,
+        .n_tile_size_last = 64,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 12,
+        .layer_name = "conv_1x1_s1_ic64_oc64",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 32,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 32,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 8,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 13,
+        .layer_name = "conv_1x1_s1_ic256_oc64",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "256_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 256,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 16,
+        .out_dim2_pitch = 256,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 65536,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 16,
+        .input_rows = 16,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 14,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "256_16_16_512_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 256,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 65536,
+        .coeff_buffer_size = 131072,
+        .output_buffer_size = 32768,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 16,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 15,
+        .layer_name = "conv_1x1_s1_ic256_oc128",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "256_16_16_128_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 256,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 16,
+        .out_dim2_pitch = 256,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 65536,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 32768,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 16,
+        .input_rows = 16,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 16,
+        .layer_name = "conv_3x3_s2_ic128_oc128",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "128_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 324,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 41472,
+        .coeff_buffer_size = 147456,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 16,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 17,
+        .layer_name = "conv_1x1_s1_ic128_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "128_8_8_512_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 16,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 18,
+        .layer_name = "conv_1x1_s1_ic512_oc128",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "512_8_8_128_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 8,
+        .in_dim2_pitch = 64,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 8,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 32768,
+        .coeff_buffer_size = 65536,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 8,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 19,
+        .layer_name = "conv_1x1_s2_ic512_oc1024",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 8,
+        .in_dim2_pitch = 64,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 8,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 1024,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 32768,
+        .coeff_buffer_size = 524288,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 1024,
+        .n_tiles = 1,
+        .n_tile_size_last = 1024,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 8,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 20,
+        .layer_name = "conv_1x1_s1_ic512_oc256",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "512_8_8_256_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 8,
+        .in_dim2_pitch = 64,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 8,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 32768,
+        .coeff_buffer_size = 131072,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 8,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 21,
+        .layer_name = "conv_3x3_s2_ic256_oc256",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "256_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 8,
+        .in_dim2_pitch = 100,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 8,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 25600,
+        .coeff_buffer_size = 589824,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 8,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 22,
+        .layer_name = "conv_1x1_s1_ic256_oc1024",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 64,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 23,
+        .layer_name = "conv_1x1_s1_ic1024_oc256",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 16384,
+        .coeff_buffer_size = 262144,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 4,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 24,
+        .layer_name = "conv_1x1_s2_ic1024_oc2048",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 2048,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 16384,
+        .coeff_buffer_size = 2097152,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 2048,
+        .n_tiles = 1,
+        .n_tile_size_last = 2048,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 25,
+        .layer_name = "conv_1x1_s1_ic1024_oc512",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 16384,
+        .coeff_buffer_size = 524288,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 4,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 26,
+        .layer_name = "conv_3x3_s2_ic512_oc512",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "512_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 36,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 18432,
+        .coeff_buffer_size = 2359296,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 27,
+        .layer_name = "conv_1x1_s1_ic512_oc2048",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 2048,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 1048576,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 2048,
+        .n_tiles = 1,
+        .n_tile_size_last = 2048,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 28,
+        .layer_name = "conv_1x1_s1_ic2048_oc512",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 2048,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 512,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 2048,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 2048,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 1048576,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 0,
+        .output_pong_dram = 0,
+        .bias_dram = 0,
+        .outscale_dram = 0,
+        .n_tile_size = 512,
+        .n_tiles = 1,
+        .n_tile_size_last = 512,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+};
+
+static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; }  /* entry count as declared by the NUM_CONV_LAYERS macro, not derived from the array size */
+
+static inline const conv_layer_config_t* get_conv_config(int layer_id) {
+    /* layer_id is used directly as the table index; ids outside [0, NUM_CONV_LAYERS) yield NULL. */
+    return (layer_id >= 0 && layer_id < NUM_CONV_LAYERS) ? &CONV_LAYER_CONFIGS[layer_id] : NULL;
+}
+
+static inline const conv_layer_config_t* get_layer_config_by_params(
+    int ic, int ih, int iw,
+    int oc, int kh, int kw,
+    int oh, int ow,
+    int sy, int sx,
+    int pad, int dil)
+{
+    /* Linear scan of the table; the first entry equal in all twelve fields is returned. */
+    for (int idx = 0; idx < NUM_CONV_LAYERS; ++idx) {
+        const conv_layer_config_t* entry = &CONV_LAYER_CONFIGS[idx];
+        /* input tensor: channels (src dim3), height (src dim2), width (src dim1) */
+        if (entry->src_dim3_size != ic || entry->src_dim2_size != ih || entry->src_dim1_size != iw) continue;
+        /* output tensor: channels (dst dim3), height (dst dim2), width (dst dim1) */
+        if (entry->dst_dim3_size != oc || entry->dst_dim2_size != oh || entry->dst_dim1_size != ow) continue;
+        /* kernel window: height (coeff dim2), width (coeff dim1) */
+        if (entry->coeff_dim2_size != kh || entry->coeff_dim1_size != kw) continue;
+        /* vertical / horizontal stride */
+        if (entry->stride_y != sy || entry->stride_x != sx) continue;
+        /* padding and dilation */
+        if (entry->padding != pad || entry->dilation != dil) continue;
+        return entry;
+    }
+    /* No table entry describes this layer. */
+    return NULL;
+}
+
+static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) {
+    if (!config_key) return NULL;  /* a NULL key never matches */
+    for (int idx = 0; idx < NUM_CONV_LAYERS; ++idx) {
+        const char* stored = CONV_LAYER_CONFIGS[idx].config_key;
+        if (stored == NULL) continue;
+        /* Byte-wise comparison done inline; this helper does not call into <string.h>. */
+        const char* wanted = config_key;
+        while (*wanted != '\0' && *wanted == *stored) { ++wanted; ++stored; }
+        /* Equal only when both strings reach their terminator together. */
+        if (*wanted == '\0' && *stored == '\0') return &CONV_LAYER_CONFIGS[idx];
+    }
+    return NULL;
+}
+
+/* ====================================================================== */
+/*  MaxPool configurations                                             */
+/* ====================================================================== */
+
+typedef struct {  /* parameters of one max-pool layer: geometry, tiling and buffer settings */
+    int layer_id;  /* equals the entry's index in MAXPOOL_LAYER_CONFIGS for the visible entry */
+    const char* layer_name;
+    const char* config_key;  /* looks like "channels_srcH_srcW_kh_kw_sh_sw_padH_padW" judging by the visible entry */
+    /* Full source / destination plane dimensions. */
+    int src_width;  int src_height;  int channels;
+    int dst_width;  int dst_height;
+    /* Row and channel-plane pitches; units (elements vs bytes) are not stated here - TODO confirm. */
+    int src_row_pitch;  int src_plane_pitch;
+    int dst_row_pitch;  int dst_plane_pitch;
+    /* Pooling window, stride and padding; these are the fields matched by get_maxpool_config_by_params. */
+    int kernel_h;  int kernel_w;
+    int stride_h;  int stride_w;
+    int pad_h;     int pad_w;
+    /* Per-tile working geometry; in the visible entry in_tile_plane == in_tile_w * in_tile_rows. */
+    int in_tile_w;      int in_tile_rows;   int in_tile_plane;
+    int in_data_offset;
+    int out_tile_w;     int out_tile_rows;  int out_tile_plane;
+    /* Tiling; in the visible entry c_tile_size * (c_tiles - 1) + c_tile_size_last == channels. */
+    int c_tile_size;  int c_tiles;  int c_tile_size_last;
+    int height_tiles; int output_rows; int input_rows;
+    /* Buffer sizes; presumably bytes - TODO confirm. */
+    int input_buffer_size;  int output_buffer_size;
+    /* Values are 0 or 1 in the visible entry; presumably a DRAM bank selector per ping/pong buffer - TODO confirm. */
+    int input_ping_dram;  int input_pong_dram;
+    int output_ping_dram; int output_pong_dram;
+} maxpool_layer_config_t;
+
+#define NUM_MAXPOOL_LAYERS 1  /* loop/range bound used by the lookup helpers; must equal the entry count below */
+
+static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = {
+    {
+        .layer_id = 0,
+        .layer_name = "maxpool_3x3s2_c64_32x32",
+        .config_key = "64_32_32_3_3_2_2_1_1",  /* same order as the get_maxpool_config_by_params arguments */
+        .src_width = 32,
+        .src_height = 32,
+        .channels = 64,
+        .dst_width = 16,
+        .dst_height = 16,
+        .src_row_pitch = 32,
+        .src_plane_pitch = 1024,  /* == 32 * 32 */
+        .dst_row_pitch = 16,
+        .dst_plane_pitch = 256,  /* == 16 * 16 */
+        .kernel_h = 3,
+        .kernel_w = 3,
+        .stride_h = 2,
+        .stride_w = 2,
+        .pad_h = 1,
+        .pad_w = 1,
+        .in_tile_w = 34,  /* == src_width + 2 * pad_w */
+        .in_tile_rows = 5,  /* NOTE(review): differs from input_rows (3) below - confirm intended */
+        .in_tile_plane = 170,  /* == 34 * 5 */
+        .in_data_offset = 35,  /* == in_tile_w * pad_h + pad_w */
+        .out_tile_w = 16,
+        .out_tile_rows = 1,
+        .out_tile_plane = 16,
+        .c_tile_size = 11,
+        .c_tiles = 6,
+        .c_tile_size_last = 9,  /* == 64 - 5 * 11 */
+        .height_tiles = 16,  /* height_tiles * output_rows == dst_height */
+        .output_rows = 1,
+        .input_rows = 3,
+        .input_buffer_size = 7480,  /* == 170 * 11 * 4; the factor 4 is not explained here - TODO confirm */
+        .output_buffer_size = 704,  /* == 16 * 11 * 4 */
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 0,
+    },
+};
+
+static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; }  /* entry count as declared by the NUM_MAXPOOL_LAYERS macro, not derived from the array size */
+
+static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) {
+    /* layer_id is used directly as the table index; ids outside [0, NUM_MAXPOOL_LAYERS) yield NULL. */
+    return (layer_id >= 0 && layer_id < NUM_MAXPOOL_LAYERS) ? &MAXPOOL_LAYER_CONFIGS[layer_id] : NULL;
+}
+
+static inline const maxpool_layer_config_t* get_maxpool_config_by_params(
+    int channels, int src_height, int src_width,
+    int kernel_h, int kernel_w,
+    int stride_h, int stride_w,
+    int pad_h, int pad_w)
+{
+    for (int idx = 0; idx < NUM_MAXPOOL_LAYERS; ++idx) {
+        const maxpool_layer_config_t* entry = &MAXPOOL_LAYER_CONFIGS[idx];
+        /* input geometry */
+        if (entry->channels != channels || entry->src_height != src_height || entry->src_width != src_width) continue;
+        /* pooling window */
+        if (entry->kernel_h != kernel_h || entry->kernel_w != kernel_w) continue;
+        /* stride */
+        if (entry->stride_h != stride_h || entry->stride_w != stride_w) continue;
+        /* padding */
+        if (entry->pad_h != pad_h || entry->pad_w != pad_w) continue;
+        return entry;
+    }
+    /* Fall through: no entry matched all nine parameters. */
+    return NULL;
+}
+
+#endif /* LAYER_CONFIGS_H */
diff --git a/backends/cadence/vision/config_generator/layer_configs_cache.h b/backends/cadence/vision/config_generator/layer_configs_cache.h
new file mode 100644
index 00000000000..c88a0b41f81
--- /dev/null
+++ b/backends/cadence/vision/config_generator/layer_configs_cache.h
@@ -0,0 +1,2403 @@
+/*
+ * layer_configs.h
+ *
+ * Auto-generated conv2d + maxpool layer configurations
+ * Generated from PTE extraction by generate_combined_configs.py
+ *
+ * DO NOT EDIT MANUALLY
+ */
+
+#ifndef LAYER_CONFIGS_H
+#define LAYER_CONFIGS_H
+
+#include <stdint.h>
+#include <stddef.h>  /* for NULL */
+
+#define IDMA_BUFFER_SIZE_DRAM0 (0)  /* 0 KB; NOTE(review): zero although the *_dram fields in the entries below hold 0 and 1 - confirm intended */
+#define IDMA_BUFFER_SIZE_DRAM1 (0)  /* 0 KB; same note as DRAM0 */
+
+/* ====================================================================== */
+/*  Conv2d configurations                                              */
+/* ====================================================================== */
+
+typedef struct {  /* parameters of one conv2d layer: tensor geometry, tiling, buffers and kernel settings */
+    int layer_id;  /* equals the entry's position in CONV_LAYER_CONFIGS for the visible entries */
+    const char* layer_name;
+    const char* kernel_name;
+    const char* config_key;  /* looks like "ic_ih_iw_oc_kh_kw_oh_ow_sy_sx_pad_dil" judging by the visible entries */
+    /* Whole-layer input: dim3 = channels; dim1/dim2 presumably width/height (dim1 has the smaller pitch) - TODO confirm. */
+    int src_dim1_size;  int src_dim2_size;  int src_dim3_size;
+    int src_dim1_pitch; int src_dim2_pitch;
+    /* Whole-layer output, same dimension ordering as src. */
+    int dst_dim1_size;  int dst_dim2_size;  int dst_dim3_size;
+    int dst_dim1_pitch; int dst_dim2_pitch;
+    /* Input tile; in the visible entries in_dim1_pitch == in_dim1_size + in_dim1_edge1 + in_dim1_edge2. */
+    int in_dim1_size;   int in_dim1_pitch;
+    int in_dim2_size;   int in_dim2_pitch;
+    int in_dim1_edge1;  int in_dim1_edge2;
+    int in_dim2_edge1;  int in_dim2_edge2;
+    int in_dim3_edge1;  int in_dim3_edge2;
+    int in_data_offset; int in_rows_firstdma;
+    /* Output tile; out_dim3_size equals n_tile_size in the entries where both are visible. */
+    int out_dim1_size;  int out_dim1_pitch;
+    int out_dim2_size;  int out_dim2_pitch;
+    int out_dim3_size;
+    /* Weights: dim1/dim2 = kernel window, dim3 = input channels, dim4 = output channels (per the visible entries). */
+    int coeff_dim1_size;  int coeff_dim2_size;
+    int coeff_dim3_size;  int coeff_dim4_size;
+    int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch;
+    /* Bias / output-scale extents; dim1 equals dst_dim3_size and dim2 is 1 in the visible entries. */
+    int bias_dim1_size;     int bias_dim2_size;
+    int outscale_dim1_size; int outscale_dim2_size;
+    /* Buffer sizes; presumably bytes - TODO confirm. */
+    int input_buffer_size;  int coeff_buffer_size;  int output_buffer_size;
+    int bias_buffer_size;   int outscale_buffer_size;
+    /* Values are 0 or 1 in the visible entries; presumably a DRAM bank selector per buffer - TODO confirm. */
+    int input_ping_dram;  int input_pong_dram;  int coeff_dram;
+    int output_ping_dram; int output_pong_dram;
+    int bias_dram;        int outscale_dram;
+    /* Tiling; in the visible entries n_tile_size * (n_tiles - 1) + n_tile_size_last == dst_dim3_size. */
+    int n_tile_size; int n_tiles; int n_tile_size_last;
+    int height_tiles; int output_rows; int input_rows;
+    /* Kernel window, stride, padding, dilation; the shift/relu/scale/flags fields carry the same values in every visible entry. */
+    int kernel_w; int kernel_h;
+    int stride_x; int stride_y;
+    int padding;  int dilation;
+    int accum_shift; int relu_max; int relu_min;
+    int output_shift; int output_scale; int flags;
+    int input_zero_point;
+} conv_layer_config_t;
+
+#define NUM_CONV_LAYERS 29
+
+static const conv_layer_config_t CONV_LAYER_CONFIGS[] = {
+    {
+        .layer_id = 0,
+        .layer_name = "conv_7x7_s2_ic3_oc64",
+        .kernel_name = "7x7j2d1_no_dma",
+        .config_key = "3_64_64_64_7_7_32_32_2_2_3_1",
+        .src_dim1_size = 64,
+        .src_dim2_size = 64,
+        .src_dim3_size = 3,
+        .src_dim1_pitch = 64,
+        .src_dim2_pitch = 4096,
+        .dst_dim1_size = 32,
+        .dst_dim2_size = 32,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 32,
+        .dst_dim2_pitch = 1024,
+        .in_dim1_size = 64,
+        .in_dim1_pitch = 70,
+        .in_dim2_size = 35,
+        .in_dim2_pitch = 2450,
+        .in_dim1_edge1 = 3,
+        .in_dim1_edge2 = 3,
+        .in_dim2_edge1 = 3,
+        .in_dim2_edge2 = 3,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 213,
+        .in_rows_firstdma = 32,
+        .out_dim1_size = 32,
+        .out_dim1_pitch = 32,
+        .out_dim2_size = 15,
+        .out_dim2_pitch = 480,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 7,
+        .coeff_dim2_size = 7,
+        .coeff_dim3_size = 3,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 7,
+        .coeff_dim2_pitch = 49,
+        .coeff_dim3_pitch = 147,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7350,
+        .coeff_buffer_size = 9408,
+        .output_buffer_size = 30720,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 3,
+        .output_rows = 15,
+        .input_rows = 35,
+        .kernel_w = 7,
+        .kernel_h = 7,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 3,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 1,
+        .layer_name = "conv_3x3_s1_ic64_oc64",
+        .kernel_name = "3x3j1d1_no_dma",
+        .config_key = "64_16_16_64_3_3_16_16_1_1_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 18,
+        .in_dim2_pitch = 324,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 17,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 16,
+        .out_dim2_pitch = 256,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 20736,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 16,
+        .input_rows = 18,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 2,
+        .layer_name = "conv_1x1_s2_ic64_oc128",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "64_16_16_128_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 15,
+        .in_dim2_pitch = 240,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 15,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 15360,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 15,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 3,
+        .layer_name = "conv_3x3_s2_ic64_oc128",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "64_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5760,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 2,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 4,
+        .layer_name = "conv_3x3_s1_ic128_oc128",
+        .kernel_name = "3x3j1d1_no_dma",
+        .config_key = "128_8_8_128_3_3_8_8_1_1_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 40,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5120,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 4,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 5,
+        .layer_name = "conv_1x1_s2_ic128_oc256",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "128_8_8_256_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 7,
+        .in_dim2_pitch = 56,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 7,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7168,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 7,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 6,
+        .layer_name = "conv_3x3_s2_ic128_oc256",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "128_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6400,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 8,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 7,
+        .layer_name = "conv_3x3_s1_ic256_oc256",
+        .kernel_name = "3x3j1d1_no_dma",
+        .config_key = "256_4_4_256_3_3_4_4_1_1_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6144,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 16,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 8,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "256_4_4_512_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 12,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 4,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 9,
+        .layer_name = "conv_3x3_s2_ic256_oc512",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "256_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 30,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7680,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 32,
+        .n_tile_size_last = 16,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 10,
+        .layer_name = "conv_3x3_s1_ic512_oc512",
+        .kernel_name = "3x3j1d1_no_dma",
+        .config_key = "512_2_2_512_3_3_2_2_1_1_1_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 5,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 64,
+        .n_tile_size_last = 8,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 11,
+        .layer_name = "conv_1x1_s1_ic64_oc256",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "64_16_16_256_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 7,
+        .in_dim2_pitch = 112,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 7,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 7,
+        .out_dim2_pitch = 112,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7168,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 28672,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 3,
+        .output_rows = 7,
+        .input_rows = 7,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 12,
+        .layer_name = "conv_1x1_s1_ic64_oc64",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "64_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 256,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 16,
+        .out_dim2_pitch = 256,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 16384,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 16,
+        .input_rows = 16,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 13,
+        .layer_name = "conv_1x1_s1_ic256_oc64",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "256_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 10,
+        .in_dim2_pitch = 160,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 10,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 10,
+        .out_dim2_pitch = 160,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 40960,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 10240,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 2,
+        .output_rows = 10,
+        .input_rows = 10,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 14,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "256_16_16_512_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 48,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 4,
+        .n_tile_size_last = 128,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 15,
+        .layer_name = "conv_1x1_s1_ic256_oc128",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "256_16_16_128_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 7,
+        .in_dim2_pitch = 112,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 7,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 7,
+        .out_dim2_pitch = 112,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 28672,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 14336,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 3,
+        .output_rows = 7,
+        .input_rows = 7,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 16,
+        .layer_name = "conv_3x3_s2_ic128_oc128",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "128_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 11520,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 4,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 17,
+        .layer_name = "conv_1x1_s1_ic128_oc512",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "128_8_8_512_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 256,
+        .n_tiles = 2,
+        .n_tile_size_last = 256,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 18,
+        .layer_name = "conv_1x1_s1_ic512_oc128",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "512_8_8_128_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 2,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 19,
+        .layer_name = "conv_1x1_s2_ic512_oc1024",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 16,
+        .n_tile_size_last = 64,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 20,
+        .layer_name = "conv_1x1_s1_ic512_oc256",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "512_8_8_256_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 4,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 21,
+        .layer_name = "conv_3x3_s2_ic256_oc256",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "256_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12800,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 16,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 22,
+        .layer_name = "conv_1x1_s1_ic256_oc1024",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 8,
+        .n_tile_size_last = 128,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 23,
+        .layer_name = "conv_1x1_s1_ic1024_oc256",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 8,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 24,
+        .layer_name = "conv_1x1_s2_ic1024_oc2048",
+        .kernel_name = "1x1j2d1_no_dma",
+        .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 12,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 64,
+        .n_tile_size_last = 32,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 25,
+        .layer_name = "conv_1x1_s1_ic1024_oc512",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 16,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 26,
+        .layer_name = "conv_3x3_s2_ic512_oc512",
+        .kernel_name = "3x3j2d1_no_dma",
+        .config_key = "512_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 30,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 15360,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 64,
+        .n_tile_size_last = 8,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 27,
+        .layer_name = "conv_1x1_s1_ic512_oc2048",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 32,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 28,
+        .layer_name = "conv_1x1_s1_ic2048_oc512",
+        .kernel_name = "1x1j1d1_no_dma",
+        .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 2048,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 2048,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 2048,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 32,
+        .n_tile_size_last = 16,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+};
+
+static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; } /* Returns NUM_CONV_LAYERS, the bound the lookup helpers below use when walking CONV_LAYER_CONFIGS. */
+
+/* Positional lookup: returns &CONV_LAYER_CONFIGS[layer_id], or NULL when layer_id is outside [0, NUM_CONV_LAYERS). */
+static inline const conv_layer_config_t* get_conv_config(int layer_id) {
+    return (layer_id >= 0 && layer_id < NUM_CONV_LAYERS) ? &CONV_LAYER_CONFIGS[layer_id] : NULL;
+}
+
+/* Returns the first CONV_LAYER_CONFIGS entry whose source/destination dims, kernel dims, */
+/* stride, padding and dilation all equal the arguments; NULL when no entry matches.      */
+static inline const conv_layer_config_t* get_layer_config_by_params(
+    int ic, int ih, int iw,
+    int oc, int kh, int kw,
+    int oh, int ow,
+    int sy, int sx,
+    int pad, int dil)
+{
+    const conv_layer_config_t* const table_end = CONV_LAYER_CONFIGS + NUM_CONV_LAYERS;
+    for (const conv_layer_config_t* entry = CONV_LAYER_CONFIGS; entry != table_end; ++entry) {
+        /* source dims (dim3, dim2, dim1) against ic, ih, iw */
+        if (entry->src_dim3_size != ic || entry->src_dim2_size != ih || entry->src_dim1_size != iw) continue;
+        /* destination dims (dim3, dim2, dim1) against oc, oh, ow */
+        if (entry->dst_dim3_size != oc || entry->dst_dim2_size != oh || entry->dst_dim1_size != ow) continue;
+        /* kernel dims (coeff dim2, dim1) against kh, kw */
+        if (entry->coeff_dim2_size != kh || entry->coeff_dim1_size != kw) continue;
+        /* strides */
+        if (entry->stride_y != sy || entry->stride_x != sx) continue;
+        /* padding and dilation */
+        if (entry->padding != pad || entry->dilation != dil) continue;
+        return entry;
+    }
+    return NULL;
+}
+
+/* Returns the first entry whose config_key string equals the argument exactly; NULL if none matches or the argument is NULL. */
+static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) {
+    if (!config_key) return NULL;
+    for (int idx = 0; idx < NUM_CONV_LAYERS; ++idx) {
+        const char* stored = CONV_LAYER_CONFIGS[idx].config_key;
+        if (!stored) continue; /* entries without a key never match */
+        /* Hand-rolled string compare: advance while both strings agree and neither has ended. */
+        const char* wanted = config_key;
+        for (; *wanted && *stored && *wanted == *stored; ++wanted, ++stored) { }
+        if (*wanted == '\0' && *stored == '\0') return &CONV_LAYER_CONFIGS[idx];
+    }
+    return NULL;
+}
+
+/* ====================================================================== */
+/*  MaxPool configurations                                             */
+/* ====================================================================== */
+
+typedef struct { /* One row of MAXPOOL_LAYER_CONFIGS: the settings recorded for a single max-pool layer. */
+    int layer_id; /* NOTE(review): get_maxpool_config() indexes the table by position, so this presumably equals the entry's index - confirm */
+    const char* layer_name;
+    const char* config_key;
+
+    int src_width;  int src_height;  int channels; /* all three are compared by get_maxpool_config_by_params() */
+    int dst_width;  int dst_height;
+
+    int src_row_pitch;  int src_plane_pitch; /* presumably strides between rows / planes of the source - TODO confirm units */
+    int dst_row_pitch;  int dst_plane_pitch; /* presumably strides between rows / planes of the destination - TODO confirm units */
+
+    int kernel_h;  int kernel_w; /* compared by get_maxpool_config_by_params() */
+    int stride_h;  int stride_w; /* compared by get_maxpool_config_by_params() */
+    int pad_h;     int pad_w; /* compared by get_maxpool_config_by_params() */
+
+    int in_tile_w;      int in_tile_rows;   int in_tile_plane;
+    int in_data_offset;
+    int out_tile_w;     int out_tile_rows;  int out_tile_plane;
+
+    int c_tile_size;  int c_tiles;  int c_tile_size_last;
+    int height_tiles; int output_rows; int input_rows;
+
+    int input_buffer_size;  int output_buffer_size; /* presumably in bytes - TODO confirm */
+
+    int input_ping_dram;  int input_pong_dram; /* NOTE(review): 0/1 presumably selects a local data-RAM bank per buffer - confirm against the max-pool executor */
+    int output_ping_dram; int output_pong_dram;
+} maxpool_layer_config_t;
+
+#define NUM_MAXPOOL_LAYERS 1
+
+static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = { /* Max-pool layer table; holds NUM_MAXPOOL_LAYERS (1) entry. */
+    {
+        .layer_id = 0, /* same as this entry's position in the array */
+        .layer_name = "maxpool_3x3s2_c64_32x32",
+        .config_key = "64_32_32_3_3_2_2_1_1", /* channels, src_height, src_width, kernel_h/w, stride_h/w, pad_h/w joined with '_' */
+        .src_width = 32,
+        .src_height = 32,
+        .channels = 64,
+        .dst_width = 16,
+        .dst_height = 16,
+        .src_row_pitch = 32,
+        .src_plane_pitch = 1024, /* equals src_row_pitch * src_height */
+        .dst_row_pitch = 16,
+        .dst_plane_pitch = 256, /* equals dst_row_pitch * dst_height */
+        .kernel_h = 3,
+        .kernel_w = 3,
+        .stride_h = 2,
+        .stride_w = 2,
+        .pad_h = 1,
+        .pad_w = 1,
+        .in_tile_w = 34, /* equals src_width + 2 * pad_w */
+        .in_tile_rows = 5, /* equals input_rows + 2 * pad_h */
+        .in_tile_plane = 170, /* equals in_tile_w * in_tile_rows */
+        .in_data_offset = 35, /* equals pad_h * in_tile_w + pad_w */
+        .out_tile_w = 16,
+        .out_tile_rows = 1,
+        .out_tile_plane = 16, /* equals out_tile_w * out_tile_rows */
+        .c_tile_size = 64,
+        .c_tiles = 1,
+        .c_tile_size_last = 64,
+        .height_tiles = 16, /* equals dst_height / output_rows */
+        .output_rows = 1,
+        .input_rows = 3,
+        .input_buffer_size = 43520, /* equals in_tile_plane * c_tile_size * 4; NOTE(review): what the factor 4 represents is not visible here - confirm */
+        .output_buffer_size = 4096, /* equals out_tile_plane * c_tile_size * 4 */
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 0,
+    },
+};
+
+static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; } /* Returns NUM_MAXPOOL_LAYERS, the bound the max-pool lookup helpers below use. */
+
+/* Positional lookup: returns &MAXPOOL_LAYER_CONFIGS[layer_id], or NULL when layer_id is outside [0, NUM_MAXPOOL_LAYERS). */
+static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) {
+    return (layer_id >= 0 && layer_id < NUM_MAXPOOL_LAYERS) ? &MAXPOOL_LAYER_CONFIGS[layer_id] : NULL;
+}
+
+/* Returns the first MAXPOOL_LAYER_CONFIGS entry whose channel count, source size, kernel, */
+/* stride and padding all equal the arguments; NULL when no entry matches.                */
+static inline const maxpool_layer_config_t* get_maxpool_config_by_params(
+    int channels, int src_height, int src_width,
+    int kernel_h, int kernel_w,
+    int stride_h, int stride_w,
+    int pad_h, int pad_w)
+{
+    int idx = 0;
+    while (idx < NUM_MAXPOOL_LAYERS) {
+        const maxpool_layer_config_t* entry = &MAXPOOL_LAYER_CONFIGS[idx++];
+        const int same_input  = entry->channels == channels && entry->src_height == src_height && entry->src_width == src_width;
+        const int same_kernel = entry->kernel_h == kernel_h && entry->kernel_w == kernel_w;
+        const int same_stride = entry->stride_h == stride_h && entry->stride_w == stride_w;
+        const int same_pad    = entry->pad_h == pad_h && entry->pad_w == pad_w;
+        if (same_input && same_kernel && same_stride && same_pad) {
+            return entry;
+        }
+    }
+    return NULL;
+}
+
+#endif /* LAYER_CONFIGS_H */
diff --git a/backends/cadence/vision/kernels/CMakeLists.txt b/backends/cadence/vision/kernels/CMakeLists.txt
index fa7b2b5203b..dc8d73b5d5b 100644
--- a/backends/cadence/vision/kernels/CMakeLists.txt
+++ b/backends/cadence/vision/kernels/CMakeLists.txt
@@ -8,6 +8,7 @@
 add_library(
   cadence_kernels
   kernels.cpp
+  ${EXECUTORCH_ROOT}/backends/cadence/generic/kernels/kernels.cpp
   ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/api/tensor_transposef.c
   ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/api/vsoftmaxf.c
   ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/tables/expf_tbl.c
@@ -22,7 +23,8 @@ set(_common_include_directories
 
 target_include_directories(
   cadence_kernels
-  PUBLIC . ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/include
+  PUBLIC . ${EXECUTORCH_ROOT}/backends/cadence/generic/kernels
+         ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/include
          ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/include_private
          ${_common_include_directories}
 )
diff --git a/backends/cadence/vision/operators/CMakeLists.txt b/backends/cadence/vision/operators/CMakeLists.txt
index 38e4f97f841..7e458a56e31 100644
--- a/backends/cadence/vision/operators/CMakeLists.txt
+++ b/backends/cadence/vision/operators/CMakeLists.txt
@@ -25,6 +25,12 @@ set(_aten_ops__srcs
     "${CMAKE_CURRENT_SOURCE_DIR}/op_full.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/op_view_copy.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/op_softmax.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_max_pool2d_with_indices.cpp" # takes the place of the portable op_max_pool2d_with_indices.cpp removed from this list further down
+    "${CMAKE_CURRENT_SOURCE_DIR}/maxpool/maxpool_exec_mxnj2.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/mean/mean_exec_dma.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_add.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_mean.cpp" # NOTE(review): kernels/portable/cpu/op_mean.cpp is still listed further down - confirm both are meant to be built
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_relu_out.cpp" # NOTE(review): also compiled into custom_ops below - confirm the duplicate is intended
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
@@ -41,7 +47,6 @@ set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_hardtanh.cpp"
-    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_max_pool2d_with_indices.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mean.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
@@ -69,41 +74,55 @@ set(_aten_ops__srcs
 )
 add_library(aten_ops_cadence ${_aten_ops__srcs})
 target_link_libraries(aten_ops_cadence PUBLIC executorch)
-target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels)
+target_link_libraries(aten_ops_cadence PRIVATE xa_nnlib cadence_kernels idma)
 
 # Let files say "include <executorch/path/to/header.h>".
-set(_common_include_directories
-    ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
-)
+set(_common_include_directories ${EXECUTORCH_ROOT}/..
+    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
 
 target_include_directories(
-  aten_ops_cadence
-  PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} ${_common_include_directories}
-         ${CMAKE_CURRENT_SOURCE_DIR}/../third-party
+  aten_ops_cadence
+  PUBLIC ${ROOT_DIR}/..
+         ${CMAKE_BINARY_DIR}
+         ${_common_include_directories}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/include # PRIVATE: used only when compiling this target
 )
 
 # Custom ops that are needed to run the test model.
 add_library(
   custom_ops
-  "op_quantized_linear_out.cpp"
-  "op_quantized_conv_out.cpp"
-  "op_quantized_relu_out.cpp"
-  "op_quantized_layer_norm.cpp"
-  "op_quantize_per_tensor.cpp"
-  "op_quantized_fully_connected_out.cpp"
-  "op_dequantize_per_tensor.cpp"
-  "op_quantized_matmul_out.cpp"
-  "op_requantize_out.cpp"
-  "op_im2row_out.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_conv_out.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/conv/conv_kernel_dispatcher.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/conv/conv_exec_7x7j2d1.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/conv/conv_exec_1x1j1d1.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/conv/conv_exec_1x1j2d1.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/conv/conv_exec_3x3j1d1.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/conv/conv_exec_3x3j2d1.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_relu_out.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_layer_norm.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_quantize_per_tensor.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_linear_out.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_fully_connected_out.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_dequantize_per_tensor.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_matmul_out.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_requantize_out.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_im2row_out.cpp"
 )
+target_link_libraries(custom_ops PUBLIC executorch)
+target_link_libraries(custom_ops PRIVATE xa_nnlib cadence_kernels idma xai)
+
 target_include_directories(
-  custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}
-                    ${_common_include_directories}
+  custom_ops
+  PUBLIC
+    ${ROOT_DIR}/..
+    ${CMAKE_BINARY_DIR}
+    ${_common_include_directories}
+  PRIVATE # third-party headers: used only when compiling custom_ops itself
+    ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/libxai/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/libxai_common/include
 )
 
-target_link_libraries(custom_ops PUBLIC executorch)
-target_link_libraries(custom_ops PRIVATE cadence_kernels)
-
 # Generate C++ bindings to register kernels into both PyTorch (for AOT) and
 # Executorch (for runtime). Here select all ops in functions_vision.yaml
 gen_selected_ops(
@@ -119,6 +138,3 @@ message("Generated cadence x86 files ${gen_command_sources}")
 gen_operators_lib(
   LIB_NAME "cadence_ops_lib" KERNEL_LIBS custom_ops DEPS aten_ops_cadence
 )
-
-# Link custom_ops to the generated library to ensure the symbols are available
-target_link_libraries(cadence_ops_lib PUBLIC custom_ops)
diff --git a/backends/cadence/vision/operators/TARGETS b/backends/cadence/vision/operators/TARGETS
new file mode 100644
index 00000000000..67f2bab681a
--- /dev/null
+++ b/backends/cadence/vision/operators/TARGETS
@@ -0,0 +1,5 @@
+load("targets.bzl", "define_common_targets")
+
+oncall("odai_jarvis")
+
+define_common_targets()
diff --git a/backends/cadence/vision/operators/conv/conv_exec_1x1j1d1.c b/backends/cadence/vision/operators/conv/conv_exec_1x1j1d1.c
new file mode 100644
index 00000000000..ad92342bd48
--- /dev/null
+++ b/backends/cadence/vision/operators/conv/conv_exec_1x1j1d1.c
@@ -0,0 +1,1023 @@
+#include "kernel_executors.h"
+#include "memory_manager.h"
+#include "dma.h"
+#include "utils.h"
+#include <xai_cnn_api.h>
+#include <string.h>
+#include <xtensa/hal.h>
+
+// VQ (per-channel output scaling) DMA version. Returns XAI_ERR_OK, the first non-OK kernel status, or -1 if a buffer allocation fails.
+XAI_ERR_TYPE conv_exec_1x1j1d1VQ(
+    int8_t* src,           // input tensor; source of the channel-1 input DMA transfers
+    int8_t* dst,           // output tensor; destination of the channel-0 output DMA transfers
+    int8_t* coeff_ptr,     // coefficients; copied into p_coeff with dma_1dm
+    int8_t* bias_ptr,      // bias values; copied into p_bias and consumed as an XAI_ARRAY_S32 array
+    int8_t* outScale_ptr,  // output scales; copied into p_outscale and consumed as an XAI_ARRAY_U16 array
+    const conv_layer_config_t* config)  // buffer sizes, pitches, tiling counts and conv parameters for this layer
+{
+    // ========================================================================
+    // SECTION 1: DRAM Buffer Allocation
+    // ========================================================================
+    int dram0_used = 0;
+    int dram1_used = 0;
+    
+    int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, 
+                                              config->input_ping_dram, 
+                                              &dram0_used, &dram1_used);
+    int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, 
+                                              config->input_pong_dram, 
+                                              &dram0_used, &dram1_used);
+    int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, 
+                                             config->coeff_dram, 
+                                             &dram0_used, &dram1_used);
+    int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, 
+                                               config->output_ping_dram, 
+                                               &dram0_used, &dram1_used);
+    int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, 
+                                               config->output_pong_dram, 
+                                               &dram0_used, &dram1_used);
+    int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, 
+                                            config->bias_dram, 
+                                            &dram0_used, &dram1_used);
+    int8_t* p_outscale = allocate_dram_buffer(config->outscale_buffer_size, 
+                                                config->outscale_dram, 
+                                                &dram0_used, &dram1_used);
+    
+    if (!p_input0 || !p_input1 || !p_coeff || 
+        !p_output0 || !p_output1 || !p_bias || !p_outscale) {
+        return (-1);  // NOTE(review): raw -1 rather than a named XAI_ERR_* code
+    }
+    
+    // ========================================================================
+    // SECTION 2: Initialize XAI Tile Descriptors
+    // ========================================================================
+    xai_tile3D tile_input;
+    xai_size3D frame_size_input;
+    xai_tile4D tile_coeff;
+    xai_array tile_bias;
+    xai_array tile_outscale;
+    xai_tile3D tile_output;
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    // NOTE(review): unlike conv_exec_1x1j1d1 in this file, no dma_3dm_init(1)/dma_2dm_init(0) is called here — confirm the caller initializes the channels
+    // Transfer constant data on DMA channel 0 (original note: all buffers are 64-byte aligned by test harness)
+    dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size);
+    dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size);
+    dma_1dm(0, outScale_ptr, p_outscale, config->outscale_buffer_size);
+    
+    // Pre-fill the ping input buffer with the input zero point, then load the first tile over it
+    _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size);
+    
+    // First input transfer (channel 1): config->in_rows_firstdma rows, written at buffer offset 0
+    dma_3dm(1,
+            (void*)src,
+            (void*)&p_input0[0],  // No offset for 1x1
+            config->src_dim1_pitch,
+            config->in_dim1_pitch,
+            config->src_dim2_pitch,
+            config->in_dim2_pitch,
+            config->src_dim1_size,
+            config->in_rows_firstdma,
+            config->src_dim3_size);
+    
+    // Wait for all initial DMA transfers to complete
+    idma_hw_wait_all(0);  // coeff + bias + outscale on ch0
+    idma_hw_wait_all(1);  // input on ch1
+    
+    // ========================================================================
+    // Configure Input Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[0]);  // No offset for 1x1
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, 0);
+    // DIM2 = in_rows_firstdma. NOTE(review): never updated in the tile loop although later tiles DMA config->input_rows rows — confirm both are equal
+    XAI_TILE3D_SET_DIM2(&tile_input, config->in_rows_firstdma);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+    
+    // frame_size_input is filled in here but never read in this function (no edge extension is performed for 1x1)
+    frame_size_input.dim1Size = config->src_dim1_size;
+    frame_size_input.dim2Size = config->src_dim2_size;
+    frame_size_input.dim3Size = config->src_dim3_size;
+    
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)
+    // ========================================================================
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Output Scale Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, p_outscale);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->outscale_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_outscale, p_outscale);
+    XAI_ARRAY_SET_WIDTH(&tile_outscale, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1);
+    XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16);
+    XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+    
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+    
+    // ========================================================================
+    // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles)
+    // ========================================================================
+    int last_tile = 1;  // constant 1 here, so last_n_tile marks the final N-tile and last_h_tile the final iteration overall
+    
+    for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) {
+        int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1);
+        int current_n_size = (idx_n < config->n_tiles - 1) ? 
+                             config->n_tile_size : config->n_tile_size_last;
+        
+        // Re-point coeff/bias/outscale/output descriptors at N-tile idx_n (byte offsets: 4 per S32 bias, 2 per U16 scale)
+        XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n);
+        XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size);
+        
+        XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size);
+        
+        XAI_ARRAY_SET_DATA_PTR(&tile_outscale, &p_outscale[config->n_tile_size * 2 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_outscale, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_outscale, current_n_size);
+        
+        XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n);
+        XAI_TILE3D_SET_DIM3(&tile_output, current_n_size);
+        
+        // Process vertical tiles
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1);
+            
+            // Calculate actual rows for this height tile (handle last tile edge case)
+            int current_output_rows = (idx_h < config->height_tiles - 1) ? 
+                                      config->output_rows : 
+                                      (config->dst_dim2_size - (config->output_rows * idx_h));
+            // current_output_rows sets the output tile's DIM2 and the byte count of the output DMA below
+            
+            // ================================================================
+            // Prefetch Next Input Tile (Ping-Pong Buffering)
+            // ================================================================
+            // NOTE(review): the src offset scales rows by src_dim1_size while the transfer's row pitch is src_dim1_pitch — confirm they are equal
+            // For stride-1 each output row comes from one input row; config->input_rows rows are fetched into p_input1
+            if (!last_h_tile) {
+                int temp_idx_h;
+                inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1);  // next H index (presumably wraps to 0 after the last H-tile — confirm)
+                _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size);
+
+                dma_3dm(1,
+                        &(src[(config->stride_y * config->output_rows * temp_idx_h) * config->src_dim1_size]),
+                        &(p_input1[0]),  // No offset for 1x1 kernel (IN_DATA_OFFSET = 0)
+                        config->src_dim1_pitch,
+                        config->in_dim1_pitch,
+                        config->src_dim2_pitch,
+                        config->in_dim2_pitch,
+                        config->src_dim1_size,
+                        config->input_rows,
+                        config->src_dim3_size);
+            }
+            
+            // ================================================================
+            // Update Tile Descriptors for Current Iteration
+            // ================================================================
+            // The kernel reads p_input0 and writes p_output1; both ping/pong pairs are swapped at the end of the iteration
+            // For 1x1 kernel: data always starts at buffer offset 0 (no edge padding)
+            XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+            XAI_TILE3D_SET_DATA_PTR(&tile_input, &(p_input0[0]));  // IN_DATA_OFFSET = 0 for 1x1 kernel
+            XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1);
+            XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows);
+            
+            // ================================================================
+            // Perform Convolution (no edge extension needed for 1x1)
+            // ================================================================
+            XAI_ERR_TYPE status = xaiConvolvedVQ3D_S_1x1j1d1_S8S8IX_MOW_WHD(
+                                        &(tile_input),
+                                        &(tile_coeff),
+                                        &(tile_bias),
+                                        &(tile_outscale),
+                                        &(tile_output),
+                                        &(params));
+            
+            if (status != XAI_ERR_OK) {
+                return status;  // NOTE(review): early exit does not wait for DMA transfers already issued above
+            }
+
+            // On the last H-tile of every N-tile but the final one: DMA coefficients for N-tile temp_idx_n (presumably idx_n + 1 — confirm inc_iter_to_temp) into p_coeff
+            if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) {
+                int temp_idx_n;
+                int temp_idx_h;
+                inc_iter_to_temp(&(temp_idx_n),idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1));
+                dma_1dm(0,/* src */ (((coeff_ptr) + ((config->coeff_buffer_size)*(temp_idx_n)))), /* dst */ &(p_coeff[0]), /* row size */ (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last)));
+            }  // NOTE(review): no explicit wait on channel 0 before the next N-tile's convolution reads p_coeff — confirm dma_1dm blocks
+
+            // ================================================================
+            // Write Output Tile to System Memory
+            // ================================================================
+            // NOTE(review): each channel is copied as one run of out_dim1_pitch * rows bytes, so dst rows must be laid out at pitch out_dim1_pitch — confirm vs dst_dim1_pitch
+            // dma_2dm params: src, dst, src_pitch, dst_pitch, row_size, num_rows
+            // row_size = actual bytes for this tile (may be less for last height tile)
+            // num_rows = current_n_size (number of output channels in this N-tile)
+            int output_row_bytes = config->out_dim1_pitch * current_output_rows;
+            dma_2dm(0,
+                    &(p_output1[0]),
+                    &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim1_pitch * config->output_rows)*(idx_h))],
+                    config->out_dim2_pitch,
+                    config->dst_dim2_pitch,
+                    output_row_bytes,
+                    current_n_size);
+
+            swap_buffers(&(p_output0), &(p_output1));
+            swap_buffers(&(p_input0), &(p_input1));  // NOTE(review): no idma_hw_wait_all(1) precedes the next read of the prefetched buffer — confirm dma_3dm synchronizes
+        }
+    }
+    
+    // Wait for outstanding channel-0 DMA (final output write) to complete before returning
+    idma_hw_wait_all(0);
+
+    return XAI_ERR_OK;
+}
+
+// VQ (per-channel output scaling) cache version: all data stays in system memory and is accessed through processor cache
+// Copies src into the buffer from get_cache_padded_input(), runs one xaiConvolvedVQ3D call over the whole layer; returns XAI_ERR_DATASIZE if that buffer is too small, else the kernel status
+XAI_ERR_TYPE conv_exec_1x1j1d1VQ_cache(
+    int8_t* src,           // raw (unpadded) input; copied into the padded buffer
+    int8_t* dst,           // output; written directly by the kernel, then written back from the data cache
+    int8_t* coeff_ptr,     // coefficients, used in place (no staging copy)
+    int8_t* bias_ptr,      // bias values, used in place as an XAI_ARRAY_S32 array
+    int8_t* outScale_ptr,  // output scales, used in place as an XAI_ARRAY_U16 array
+    const conv_layer_config_t* config)  // sizes, pitches, edges and conv parameters for this layer
+{
+    // ========================================================================
+    // Setup source raw tile descriptor (describes src without padding); it is populated but not passed to any call below
+    // ========================================================================
+    xai_tile3D src_raw;
+    XAI_TILE3D_SET_BUFF_PTR(&src_raw, src);
+    XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&src_raw, src);
+    XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0);
+
+    // ========================================================================
+    // Get padded input buffer from allocator (shared across cache kernels)
+    // For 1x1 convolution, edges are typically 0, but we still use the pattern
+    // ========================================================================
+    int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2;
+    int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);  // round up to a multiple of 2*XCHAL_IVPN_SIMD_WIDTH (mask form assumes a power of two)
+    int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2;
+    int dim2_pitch = dim1_pitch * padded_dim2;
+    int input_buffer_size = dim2_pitch * config->src_dim3_size;
+    
+    // Get shared padded input buffer from allocator
+    int8_t* padded_input = get_cache_padded_input();  // NOTE(review): result is not checked for NULL before the memset below
+    
+    if (input_buffer_size > (int)get_cache_padded_input_size()) {
+        return XAI_ERR_DATASIZE;
+    }
+    
+    // Fill the padded buffer with the input zero point (not literal zero); memset stores the value as a single byte
+    memset(padded_input, config->input_zero_point, input_buffer_size);
+
+    // ========================================================================
+    // Setup padded input tile descriptor
+    // ========================================================================
+    int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1;
+    
+    xai_tile3D tile_input;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+
+    // ========================================================================
+    // Copy raw input to padded buffer and extend edges
+    // ========================================================================
+#ifdef USE_DMA_FOR_CACHE_COPY
+    // Use DMA 3D transfer to copy input data into padded buffer at data_offset
+    dma_3dm(0,
+            /* src */           src,
+            /* dst */           &padded_input[data_offset],
+            /* src_row_pitch */ config->src_dim1_pitch,
+            /* dst_row_pitch */ dim1_pitch,
+            /* src_tile_pitch */ config->src_dim2_pitch,
+            /* dst_tile_pitch */ dim2_pitch,
+            /* row_sz */        config->src_dim1_size,
+            /* nrows */         config->src_dim2_size,
+            /* ntiles */        config->src_dim3_size);  // NOTE(review): no idma_hw_wait_all(0) follows in this function — confirm dma_3dm blocks before padded_input is read
+#else
+    // CPU copy path (no DMA required): one memcpy of src_dim1_size bytes per row
+    // Safe manual copy: avoids SIMD overread near source buffer boundary
+    for (int d = 0; d < config->src_dim3_size; d++) {
+        for (int h = 0; h < config->src_dim2_size; h++) {
+            memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch],
+                   &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch],
+                   config->src_dim1_size);
+        }
+    }
+    (void)src_raw;  // src_raw is only referenced here; no call in this function consumes it
+#endif
+    
+    xai_size3D frame_size;  // extents passed to xaiExtendEdgesConst3D_I8: output size scaled by stride (dims 1-2), src_dim3_size (dim 3)
+    frame_size.dim1Size = config->dst_dim1_size * config->stride_x;
+    frame_size.dim2Size = config->dst_dim2_size * config->stride_y;
+    frame_size.dim3Size = config->src_dim3_size;
+    
+    xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size);
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)
+    // ========================================================================
+    xai_tile4D tile_coeff;
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr);    
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);   
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    xai_array tile_bias;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr);   
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4);  // 4 bytes per S32 bias entry
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Scale Array
+    // ========================================================================
+    xai_array tile_outscale;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, outScale_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->dst_dim3_size * 2);  // 2 bytes per U16 scale entry
+    XAI_ARRAY_SET_DATA_PTR(&tile_outscale, outScale_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_outscale, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1);
+    XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16);
+    XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor (points to system memory)
+    // ========================================================================
+    xai_tile3D tile_output;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // Execute convolution using generic system-memory API
+    // This version accesses data through the processor cache
+    // ========================================================================
+    XAI_ERR_TYPE status = xaiConvolvedVQ3D(&tile_input, &tile_coeff, &tile_bias, 
+                                            &tile_outscale, &tile_output, &params);
+
+    // Writeback output from cache to system memory for DMA coherency (performed regardless of status)
+    xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size);
+
+    return status;
+}
+
+// ============================================================================
+// Non-VQ (per-tensor output scaling) versions
+// ============================================================================
+
+// Non-VQ DMA version - per-tensor output scaling
+XAI_ERR_TYPE conv_exec_1x1j1d1(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // SECTION 1: DRAM Buffer Allocation (no outscale buffer needed)
+    // ========================================================================
+    int dram0_used = 0;
+    int dram1_used = 0;
+    
+    int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, 
+                                              config->input_ping_dram, 
+                                              &dram0_used, &dram1_used);
+    int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, 
+                                              config->input_pong_dram, 
+                                              &dram0_used, &dram1_used);
+    int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, 
+                                             config->coeff_dram, 
+                                             &dram0_used, &dram1_used);
+    int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, 
+                                               config->output_ping_dram, 
+                                               &dram0_used, &dram1_used);
+    int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, 
+                                               config->output_pong_dram, 
+                                               &dram0_used, &dram1_used);
+    int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, 
+                                            config->bias_dram, 
+                                            &dram0_used, &dram1_used);
+    
+    if (!p_input0 || !p_input1 || !p_coeff || 
+        !p_output0 || !p_output1 || !p_bias) {
+        return (-1);
+    }
+    
+    // ========================================================================
+    // SECTION 2: Initialize XAI Tile Descriptors
+    // ========================================================================
+    xai_tile3D tile_input;
+    xai_size3D frame_size_input;
+    xai_tile4D tile_coeff;
+    xai_array tile_bias;
+    xai_tile3D tile_output;
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    
+    /* Initialize DMA engines */
+    dma_3dm_init(1);
+    dma_2dm_init(0);
+    
+    // Transfer constant data (no outscale)
+    dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size);
+    dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size);
+    
+    // Initialize input buffer and load first tile
+    _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size);
+    
+    // First input transfer
+    dma_3dm(1,
+            (void*)src,
+            (void*)&p_input0[0],
+            config->src_dim1_pitch,
+            config->in_dim1_pitch,
+            config->src_dim2_pitch,
+            config->in_dim2_pitch,
+            config->src_dim1_size,
+            config->in_rows_firstdma,
+            config->src_dim3_size);
+    
+    // Wait for all initial DMA transfers to complete
+    idma_hw_wait_all(0);  // coeff + bias on ch0
+    idma_hw_wait_all(1);  // input on ch1
+    
+    // ========================================================================
+    // Configure Input Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[0]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->in_rows_firstdma);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+    
+    frame_size_input.dim1Size = config->src_dim1_size;
+    frame_size_input.dim2Size = config->src_dim2_size;
+    frame_size_input.dim3Size = config->src_dim3_size;
+    
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor
+    // ========================================================================
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+    
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+    
+    // ========================================================================
+    // SECTION 3: Tiled Execution Loop
+    // ========================================================================
+    int last_tile = 1;
+    
+    for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) {
+        int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1);
+        int current_n_size = (idx_n < config->n_tiles - 1) ? 
+                             config->n_tile_size : config->n_tile_size_last;
+        
+        // Update coefficient/bias for N-tile (no outscale)
+        XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n);
+        XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size);
+        
+        XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size);
+        
+        XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n);
+        XAI_TILE3D_SET_DIM3(&tile_output, current_n_size);
+        
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1);
+            
+            int current_output_rows = (idx_h < config->height_tiles - 1) ? 
+                                      config->output_rows : 
+                                      (config->dst_dim2_size - (config->output_rows * idx_h));
+            
+            // ================================================================
+            // Prefetch Next Input Tile
+            // ================================================================
+            if (!last_h_tile) {
+                int temp_idx_h;
+                inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1);
+                _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size);
+
+                dma_3dm(1,
+                        &(src[(config->stride_y * config->output_rows * temp_idx_h) * config->src_dim1_size]),
+                        &(p_input1[0]),
+                        config->src_dim1_pitch,
+                        config->in_dim1_pitch,
+                        config->src_dim2_pitch,
+                        config->in_dim2_pitch,
+                        config->src_dim1_size,
+                        config->input_rows,
+                        config->src_dim3_size);
+            }
+            
+            // ================================================================
+            // Update Tile Descriptors
+            // ================================================================
+            XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+            XAI_TILE3D_SET_DATA_PTR(&tile_input, &(p_input0[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1);
+            XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows);
+            
+            // ================================================================
+            // Perform Convolution (non-VQ API)
+            // ================================================================
+            XAI_ERR_TYPE status = xaiConvolved3D_S_1x1j1d1_S8S8IX_MOW_WHD(
+                                        &(tile_input),
+                                        &(tile_coeff),
+                                        &(tile_bias),
+                                        &(tile_output),
+                                        &(params));
+            
+            if (status != XAI_ERR_OK) {
+                return status;
+            }
+
+            // Prefetch next coefficient tile
+            if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) {
+                int temp_idx_n;
+                int temp_idx_h;
+                inc_iter_to_temp(&(temp_idx_n),idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1));
+                dma_1dm(0, (coeff_ptr + (config->coeff_buffer_size * temp_idx_n)), &(p_coeff[0]), (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last)));
+            }
+
+            // ================================================================
+            // Write Output Tile to System Memory
+            // ================================================================
+            int output_row_bytes = config->out_dim1_pitch * current_output_rows;
+            dma_2dm(0,
+                    &(p_output1[0]),
+                    &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim1_pitch * config->output_rows)*(idx_h))],
+                    config->out_dim2_pitch,
+                    config->dst_dim2_pitch,
+                    output_row_bytes,
+                    current_n_size);
+
+            swap_buffers(&(p_output0), &(p_output1));
+            swap_buffers(&(p_input0), &(p_input1));
+        }
+    }
+    
+    // Wait for final output DMA to complete before returning
+    idma_hw_wait_all(0);
+
+    return XAI_ERR_OK;
+}
+
+/*
+ * Non-VQ (per-tensor output scaling) 1x1 convolution, cache-based path.
+ *
+ * Copies the raw input into the padded scratch buffer returned by
+ * get_cache_padded_input(), calls xaiExtendEdgesConst3D_I8() with the
+ * input zero point, and runs xaiConvolved3D() once over the whole tensor,
+ * writing directly to dst. This path issues no DMA transfers of its own.
+ *
+ * src       - raw S8 input, addressed with config->src_dim1/dim2_pitch
+ * dst       - S8 output, addressed with config->dst_dim1/dim2_pitch
+ * coeff_ptr - S8 coefficients, described below as a WHDN 4D tile
+ * bias_ptr  - bias values, described below as an S32 array
+ * config    - layer geometry and quantization parameters
+ *
+ * Returns XAI_ERR_DATASIZE when the padded scratch buffer is missing or
+ * too small for this layer; otherwise the status of xaiConvolved3D().
+ */
+XAI_ERR_TYPE conv_exec_1x1j1d1_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // Compute padded input geometry (SIMD-aligned dim1_pitch required)
+    // ========================================================================
+    int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2;
+    int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);
+    int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2;
+    int dim2_pitch = dim1_pitch * padded_dim2;
+    int input_buffer_size = dim2_pitch * config->src_dim3_size;
+
+    // Validate the scratch buffer BEFORE touching it: a missing buffer or one
+    // that is too small must never reach the memset/memcpy below.
+    int8_t* padded_input = get_cache_padded_input();
+
+    if (!padded_input ||
+        input_buffer_size > (int)get_cache_padded_input_size()) {
+        return XAI_ERR_DATASIZE;
+    }
+
+    // Pre-fill the whole buffer with the zero point so that edge regions and
+    // the pitch slack beyond each row hold a well-defined value.
+    memset(padded_input, config->input_zero_point, input_buffer_size);
+
+    // ========================================================================
+    // Setup padded input tile descriptor
+    // ========================================================================
+    // Offset of the first real element inside the padded buffer: skip the
+    // top edge rows, then the left edge columns of the first data row.
+    int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1;
+
+    xai_tile3D tile_input;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+
+    // Copy raw input to padded buffer and extend edges
+    // Safe manual copy: avoids SIMD overread near source buffer boundary
+    // Each row copies exactly src_dim1_size bytes, so source pitch is never read.
+    for (int d = 0; d < config->src_dim3_size; d++) {
+        for (int h = 0; h < config->src_dim2_size; h++) {
+            memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch],
+                   &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch],
+                   config->src_dim1_size);
+        }
+    }
+
+    // Frame extent passed to the edge-extension call: output size scaled by
+    // the strides in dim1/dim2, input depth in dim3.
+    xai_size3D frame_size;
+    frame_size.dim1Size = config->dst_dim1_size * config->stride_x;
+    frame_size.dim2Size = config->dst_dim2_size * config->stride_y;
+    frame_size.dim3Size = config->src_dim3_size;
+
+    xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size);
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor
+    // ========================================================================
+    xai_tile4D tile_coeff;
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    xai_array tile_bias;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    xai_tile3D tile_output;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // Execute convolution (non-VQ API)
+    // ========================================================================
+    XAI_ERR_TYPE status = xaiConvolved3D(&tile_input, &tile_coeff, &tile_bias,
+                                          &tile_output, &params);
+
+    // Writeback output from cache to system memory for DMA coherency
+    // Done unconditionally, even when the convolution reports an error.
+    xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size);
+
+    return status;
+}
\ No newline at end of file
diff --git a/backends/cadence/vision/operators/conv/conv_exec_1x1j2d1.c b/backends/cadence/vision/operators/conv/conv_exec_1x1j2d1.c
new file mode 100644
index 00000000000..2badf62536c
--- /dev/null
+++ b/backends/cadence/vision/operators/conv/conv_exec_1x1j2d1.c
@@ -0,0 +1,1132 @@
+#include "kernel_executors.h"
+#include "memory_manager.h"
+#include "dma.h"
+#include "utils.h"
+#include <xai_cnn_api.h>
+#include <string.h>
+#include <xtensa/hal.h>
+
+// VQ (per-channel output scaling) DMA version
+XAI_ERR_TYPE conv_exec_1x1j2d1VQ(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    int8_t* outScale_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // SECTION 1: DRAM Buffer Allocation
+    // ========================================================================
+    int dram0_used = 0;
+    int dram1_used = 0;
+    
+    int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, 
+                                              config->input_ping_dram, 
+                                              &dram0_used, &dram1_used);
+    int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, 
+                                              config->input_pong_dram, 
+                                              &dram0_used, &dram1_used);
+    int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, 
+                                             config->coeff_dram, 
+                                             &dram0_used, &dram1_used);
+    int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, 
+                                               config->output_ping_dram, 
+                                               &dram0_used, &dram1_used);
+    int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, 
+                                               config->output_pong_dram, 
+                                               &dram0_used, &dram1_used);
+    int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, 
+                                            config->bias_dram, 
+                                            &dram0_used, &dram1_used);
+    int8_t* p_outscale = allocate_dram_buffer(config->outscale_buffer_size, 
+                                                config->outscale_dram, 
+                                                &dram0_used, &dram1_used);
+    
+    if (!p_input0 || !p_input1 || !p_coeff || 
+        !p_output0 || !p_output1 || !p_bias || !p_outscale) {
+        return (-1);
+    }
+    
+    // ========================================================================
+    // SECTION 2: Initialize XAI Tile Descriptors
+    // ========================================================================
+    xai_tile3D tile_input;
+    xai_size3D frame_size_input;
+    xai_tile4D tile_coeff;
+    xai_array tile_bias;
+    xai_array tile_outscale;
+    xai_tile3D tile_output;
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    
+    // Transfer constant data (all buffers are 64-byte aligned by test harness)
+    dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size);
+    dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size);
+    dma_1dm(0, outScale_ptr, p_outscale, config->outscale_buffer_size);
+    
+    // Initialize input buffer and load first tile
+    // Note: For 1x1 kernel, IN_DATA_OFFSET = 0, no edge padding needed
+    _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size);
+    
+    // First DMA: load IN_ROWS_FIRSTDMA rows at offset 0 (no edge padding)
+    dma_3dm(1,
+            (void*)src,
+            (void*)&p_input0[config->in_data_offset],  // in_data_offset = 0 for 1x1
+            config->src_dim1_pitch,
+            config->in_dim1_pitch,
+            config->src_dim2_pitch,
+            config->in_dim2_pitch,
+            config->src_dim1_size,
+            config->in_rows_firstdma,
+            config->src_dim3_size);
+    
+    // Wait for all initial DMA transfers to complete
+    idma_hw_wait_all(0);  // coeff + bias + outscale on ch0
+    idma_hw_wait_all(1);  // input on ch1
+    
+    // ========================================================================
+    // Configure Input Tile Descriptor
+    // For 1x1: no edge padding, IN_DATA_OFFSET = 0
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);  // 0 for 1x1
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);  // 0 for 1x1
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);  // 0 for 1x1
+    // DIM2 = IN_ROWS_FIRSTDMA - EDGE1 = IN_ROWS_FIRSTDMA (edge=0 for 1x1)
+    XAI_TILE3D_SET_DIM2(&tile_input, config->in_rows_firstdma - config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);  // 0 for 1x1
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);  // 0 for 1x1
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2);
+    
+    // Frame size for edge extension (even though edge=0, still needed)
+    frame_size_input.dim1Size = config->src_dim1_size;
+    frame_size_input.dim2Size = config->src_dim2_size;
+    frame_size_input.dim3Size = config->src_dim3_size;
+    
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)
+    // ========================================================================
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Output Scale Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, p_outscale);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->outscale_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_outscale, p_outscale);
+    XAI_ARRAY_SET_WIDTH(&tile_outscale, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1);
+    XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16);
+    XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+    
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+    
+    // ========================================================================
+    // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles)
+    // ========================================================================
+    int last_tile = 1;
+    
+    for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) {
+        int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1);
+        int current_n_size = (idx_n < config->n_tiles - 1) ? 
+                             config->n_tile_size : config->n_tile_size_last;
+        
+        // Update coefficient/bias/outscale for N-tile
+        XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n);
+        XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size);
+        
+        XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size);
+        
+        XAI_ARRAY_SET_DATA_PTR(&tile_outscale, &p_outscale[config->n_tile_size * 2 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_outscale, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_outscale, current_n_size);
+        
+        XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n);
+        XAI_TILE3D_SET_DIM3(&tile_output, current_n_size);
+        
+        // Process vertical tiles
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1);
+            
+            // Calculate actual rows for this height tile (handle last tile edge case)
+            int current_output_rows = (idx_h < config->height_tiles - 1) ? 
+                                      config->output_rows : 
+                                      (config->dst_dim2_size - (config->output_rows * idx_h));
+            (void)current_output_rows;  // Used for potential future tile dimension adjustments
+            
+            // ================================================================
+            // Prefetch Next Input Tile (Ping-Pong Buffering)
+            // ================================================================
+            if (!last_h_tile) {
+                int temp_idx_h;
+                inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1);
+                _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size);
+
+                dma_3dm(1,
+                        &(src[(config->stride_y * config->output_rows * temp_idx_h) * config->src_dim1_size]),
+                        &(p_input1[0]),  // No offset for 1x1 kernel (IN_DATA_OFFSET = 0)
+                        config->src_dim1_pitch,
+                        config->in_dim1_pitch,
+                        config->src_dim2_pitch,
+                        config->in_dim2_pitch,
+                        config->src_dim1_size,
+                        config->input_rows,
+                        config->src_dim3_size);
+            }
+            
+            // ================================================================
+            // Update Tile Descriptors for Current Iteration
+            // ================================================================
+            // Update tile descriptors for current height tile
+            // For 1x1 kernel: data always starts at buffer offset 0 (no edge padding)
+            XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+            XAI_TILE3D_SET_DATA_PTR(&tile_input, &(p_input0[0]));  // IN_DATA_OFFSET = 0 for 1x1 kernel
+            XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1);
+            XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows);
+            
+            // ================================================================
+            // Perform Convolution
+            // ================================================================
+            XAI_ERR_TYPE status = xaiConvolvedVQ3D_S_1x1j2d1_S8S8IX_MOW_WHD(
+                                        &(tile_input),
+                                        &(tile_coeff),
+                                        &(tile_bias),
+                                        &(tile_outscale),
+                                        &(tile_output),
+                                        &(params));
+            
+            if (status != XAI_ERR_OK) {
+                return status;
+            }
+
+            // ================================================================
+            // Prefetch next coefficient tile (if needed for multi-N-tile layers)
+            // ================================================================
+            if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) {
+                int temp_idx_n;
+                int temp_idx_h;
+
+                inc_iter_to_temp(&(temp_idx_n), idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1));
+
+                dma_1dm(0,
+                        /* src */ (((coeff_ptr) + ((config->coeff_buffer_size)*(temp_idx_n)))),
+                        /* dst */ &(p_coeff[0]),
+                        /* row size */ (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last)));
+            }
+
+            // ================================================================
+            // Write Output Tile to System Memory
+            // ================================================================
+            // Write output tile to system memory
+            // dma_2dm params: src, dst, src_pitch, dst_pitch, row_size, num_rows
+            // row_size = out_dim1_pitch * current_output_rows (actual valid bytes per channel)
+            // num_rows = current_n_size (number of output channels in this N-tile)
+            // Note: src_stride stays at out_dim2_pitch since local buffer has fixed pitch
+            int output_row_bytes = config->out_dim1_pitch * current_output_rows;
+            dma_2dm(0,
+                    &(p_output1[0]),
+                    &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim1_pitch * config->output_rows)*(idx_h))],
+                    config->out_dim2_pitch,
+                    config->dst_dim2_pitch,
+                    output_row_bytes,
+                    current_n_size);
+
+            swap_buffers(&(p_output0), &(p_output1));
+            swap_buffers(&(p_input0), &(p_input1));
+        }
+    }
+    
+    // Wait for final output DMA to complete before returning
+    idma_hw_wait_all(0);
+
+    return XAI_ERR_OK;
+}
+
+// VQ (per-channel output scaling) cache version
+// All data stays in system memory and is accessed through processor cache
+// src/dst are described as S8 WHD tiles and coeff_ptr as an S8 WHDN tile;
+// bias_ptr is consumed as an S32 array and outScale_ptr as a U16 array, each
+// with config->dst_dim3_size entries; config supplies geometry/quantization.
+// Returns XAI_ERR_DATASIZE when the shared scratch buffer is missing or too
+// small, otherwise the status reported by xaiConvolvedVQ3D().
+XAI_ERR_TYPE conv_exec_1x1j2d1VQ_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    int8_t* outScale_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // Setup source raw tile descriptor (points to raw input without padding)
+    // ========================================================================
+    xai_tile3D src_raw;
+    XAI_TILE3D_SET_BUFF_PTR(&src_raw, src);
+    XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&src_raw, src);
+    XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0);
+
+    // ========================================================================
+    // Get padded input buffer from allocator (shared across cache kernels)
+    // For 1x1 convolution, edges are typically 0, but we still use the pattern
+    // ========================================================================
+    int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2;
+    int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);
+    int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2;
+    int dim2_pitch = dim1_pitch * padded_dim2;
+    int input_buffer_size = dim2_pitch * config->src_dim3_size;
+    // Shared scratch buffer; missing or undersized is reported as XAI_ERR_DATASIZE
+    int8_t* padded_input = get_cache_padded_input();
+    if (!padded_input || input_buffer_size > (int)get_cache_padded_input_size()) {
+        return XAI_ERR_DATASIZE;
+    }
+
+    memset(padded_input, config->input_zero_point, input_buffer_size);
+
+    // ========================================================================
+    // Setup padded input tile descriptor
+    // ========================================================================
+    int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1;
+
+    xai_tile3D tile_input;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+
+    // ========================================================================
+    // Copy raw input to padded buffer and extend edges
+    // ========================================================================
+#ifdef USE_DMA_FOR_CACHE_COPY
+    // If the scratch buffer is cached, the memset above left dirty lines over it;
+    // flush+invalidate so they can neither overwrite nor shadow the DMA'd data.
+    xthal_dcache_region_writeback_inv(padded_input, input_buffer_size);
+    dma_3dm(0,
+            /* src */           src,
+            /* dst */           &padded_input[data_offset],
+            /* src_row_pitch */ config->src_dim1_pitch,
+            /* dst_row_pitch */ dim1_pitch,
+            /* src_tile_pitch */ config->src_dim2_pitch,
+            /* dst_tile_pitch */ dim2_pitch,
+            /* row_sz */        config->src_dim1_size,
+            /* nrows */         config->src_dim2_size,
+            /* ntiles */        config->src_dim3_size);
+    // As in the DMA kernels, wait for channel 0 before the CPU touches the buffer
+    idma_hw_wait_all(0);
+#else
+    // Use library tile copy function (no DMA required)
+    xaiCopyTile3D(&src_raw, &tile_input, true);
+#endif
+
+    xai_size3D frame_size;
+    frame_size.dim1Size = config->dst_dim1_size * config->stride_x;
+    frame_size.dim2Size = config->dst_dim2_size * config->stride_y;
+    frame_size.dim3Size = config->src_dim3_size;
+    xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size);
+
+    // ========================================================================
+    // Pre-subsample input when numInCh > 64 (gather instruction workaround):
+    // the XAI 1x1j2d1 kernel then uses gather, which only works on local DRAM, not
+    // system memory (cache mode). Fix: apply the stride manually, then call with
+    // stride=1 → dispatches to 1x1j1d1, whose aligned path uses regular loads.
+    // ========================================================================
+    int need_prestride = (config->src_dim3_size > 2 * XCHAL_IVPN_SIMD_WIDTH)
+                         && (config->stride_x > 1);
+    if (need_prestride) {
+        int pre_outW = config->dst_dim1_size;
+        int pre_outH = config->dst_dim2_size;
+        int pre_d1_pitch = (pre_outW + 2*XCHAL_IVPN_SIMD_WIDTH - 1)
+                           & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);
+        int pre_d2_pitch = pre_d1_pitch * pre_outH;
+        int pre_buf_size = pre_d2_pitch * config->src_dim3_size;
+        int stride_w = config->stride_x;
+        int stride_h = config->stride_y;  // rows follow the vertical stride (StrideY is overridden below)
+        // Place pre-subsampled data after padded input (128-byte aligned)
+        int pre_offset = (input_buffer_size + 127) & ~127;
+        int8_t* pre_input = &padded_input[pre_offset];
+        if ((pre_offset + pre_buf_size) > (int)get_cache_padded_input_size()) {
+            return XAI_ERR_DATASIZE;
+        }
+
+        memset(pre_input, config->input_zero_point, pre_buf_size);
+        // Subsample: pick every stride_w-th pixel in W and every stride_h-th row in H
+        int8_t* orig = &padded_input[data_offset];
+        for (int d = 0; d < config->src_dim3_size; d++) {
+            for (int oy = 0; oy < pre_outH; oy++) {
+                for (int ox = 0; ox < pre_outW; ox++) {
+                    pre_input[d * pre_d2_pitch + oy * pre_d1_pitch + ox] =
+                        orig[d * dim2_pitch + (stride_h * oy) * dim1_pitch + stride_w * ox];
+                }
+            }
+        }
+        // Update tile_input to point to pre-subsampled data
+        XAI_TILE3D_SET_BUFF_PTR(&tile_input, pre_input);
+        XAI_TILE3D_SET_BUFF_SIZE(&tile_input, pre_buf_size);
+        XAI_TILE3D_SET_DATA_PTR(&tile_input, pre_input);
+        XAI_TILE3D_SET_DIM1_PITCH(&tile_input, pre_d1_pitch);
+        XAI_TILE3D_SET_DIM2_PITCH(&tile_input, pre_d2_pitch);
+        XAI_TILE3D_SET_DIM1(&tile_input, pre_outW);
+        XAI_TILE3D_SET_DIM2(&tile_input, pre_outH);
+        XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, 0);
+        XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, 0);
+        XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, 0);
+        XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, 0);
+    }
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)
+    // ========================================================================
+    xai_tile4D tile_coeff;
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    xai_array tile_bias;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Scale Array
+    // ========================================================================
+    xai_array tile_outscale;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, outScale_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->dst_dim3_size * 2);
+    XAI_ARRAY_SET_DATA_PTR(&tile_outscale, outScale_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_outscale, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1);
+    XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16);
+    XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    xai_tile3D tile_output;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    // If pre-strided, override stride to 1 → dispatch to 1x1j1d1 (no gather)
+    if (need_prestride) {
+        XAI_CNN_CONV_SET_STRIDEX(&params, 1);
+        XAI_CNN_CONV_SET_STRIDEY(&params, 1);
+    } else {
+        XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+        XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+    }
+
+    // ========================================================================
+    // Execute convolution via the generic system-memory API (data through cache)
+    // ========================================================================
+    XAI_ERR_TYPE status = xaiConvolvedVQ3D(&tile_input, &tile_coeff, &tile_bias,
+                                            &tile_outscale, &tile_output, &params);
+
+    // Writeback output from cache to system memory for DMA coherency
+    xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size);
+    return status;
+}
+
+// ============================================================================
+// Non-VQ (per-tensor output scaling) versions
+// ============================================================================
+
+// Non-VQ DMA version - per-tensor output scaling
+XAI_ERR_TYPE conv_exec_1x1j2d1(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // SECTION 1: DRAM Buffer Allocation (no outscale buffer needed)
+    // ========================================================================
+    int dram0_used = 0;
+    int dram1_used = 0;
+    
+    int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, 
+                                              config->input_ping_dram, 
+                                              &dram0_used, &dram1_used);
+    int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, 
+                                              config->input_pong_dram, 
+                                              &dram0_used, &dram1_used);
+    int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, 
+                                             config->coeff_dram, 
+                                             &dram0_used, &dram1_used);
+    int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, 
+                                               config->output_ping_dram, 
+                                               &dram0_used, &dram1_used);
+    int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, 
+                                               config->output_pong_dram, 
+                                               &dram0_used, &dram1_used);
+    int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, 
+                                            config->bias_dram, 
+                                            &dram0_used, &dram1_used);
+    
+    if (!p_input0 || !p_input1 || !p_coeff || 
+        !p_output0 || !p_output1 || !p_bias) {
+        return (-1);
+    }
+    
+    // ========================================================================
+    // SECTION 2: Initialize XAI Tile Descriptors
+    // ========================================================================
+    xai_tile3D tile_input;
+    xai_size3D frame_size_input;
+    xai_tile4D tile_coeff;
+    xai_array tile_bias;
+    xai_tile3D tile_output;
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    
+    /* Initialize DMA engines */
+    dma_3dm_init(1);
+    dma_2dm_init(0);
+    
+    // Transfer constant data (no outscale)
+    dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size);
+    dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size);
+    
+    // Initialize input buffer and load first tile
+    _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size);
+    
+    // First DMA
+    dma_3dm(1,
+            (void*)src,
+            (void*)&p_input0[config->in_data_offset],
+            config->src_dim1_pitch,
+            config->in_dim1_pitch,
+            config->src_dim2_pitch,
+            config->in_dim2_pitch,
+            config->src_dim1_size,
+            config->in_rows_firstdma,
+            config->src_dim3_size);
+    
+    // Wait for all initial DMA transfers to complete
+    idma_hw_wait_all(0);  // coeff + bias on ch0
+    idma_hw_wait_all(1);  // input on ch1
+    
+    // ========================================================================
+    // Configure Input Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->in_rows_firstdma - config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2);
+    
+    frame_size_input.dim1Size = config->src_dim1_size;
+    frame_size_input.dim2Size = config->src_dim2_size;
+    frame_size_input.dim3Size = config->src_dim3_size;
+    
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor
+    // ========================================================================
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+    
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+    
+    // ========================================================================
+    // SECTION 3: Tiled Execution Loop
+    // ========================================================================
+    int last_tile = 1;
+    
+    for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) {
+        int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1);
+        int current_n_size = (idx_n < config->n_tiles - 1) ? 
+                             config->n_tile_size : config->n_tile_size_last;
+        
+        // Update coefficient/bias for N-tile (no outscale)
+        XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n);
+        XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size);
+        
+        XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size);
+        
+        XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n);
+        XAI_TILE3D_SET_DIM3(&tile_output, current_n_size);
+        
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1);
+            
+            int current_output_rows = (idx_h < config->height_tiles - 1) ? 
+                                      config->output_rows : 
+                                      (config->dst_dim2_size - (config->output_rows * idx_h));
+            
+            // ================================================================
+            // Prefetch Next Input Tile
+            // ================================================================
+            if (!last_h_tile) {
+                int temp_idx_h;
+                inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1);
+                _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size);
+
+                dma_3dm(1,
+                        &(src[(config->stride_y * config->output_rows * temp_idx_h) * config->src_dim1_size]),
+                        &(p_input1[0]),
+                        config->src_dim1_pitch,
+                        config->in_dim1_pitch,
+                        config->src_dim2_pitch,
+                        config->in_dim2_pitch,
+                        config->src_dim1_size,
+                        config->input_rows,
+                        config->src_dim3_size);
+            }
+            
+            // ================================================================
+            // Update Tile Descriptors
+            // ================================================================
+            XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+            XAI_TILE3D_SET_DATA_PTR(&tile_input, &(p_input0[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1);
+            XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows);
+            
+            // ================================================================
+            // Perform Convolution (non-VQ API)
+            // ================================================================
+            XAI_ERR_TYPE status = xaiConvolved3D_S_1x1j2d1_S8S8IX_MOW_WHD(
+                                        &(tile_input),
+                                        &(tile_coeff),
+                                        &(tile_bias),
+                                        &(tile_output),
+                                        &(params));
+            
+            if (status != XAI_ERR_OK) {
+                return status;
+            }
+
+            // Prefetch next coefficient tile
+            if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) {
+                int temp_idx_n;
+                int temp_idx_h;
+                inc_iter_to_temp(&(temp_idx_n), idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1));
+                dma_1dm(0, (coeff_ptr + (config->coeff_buffer_size * temp_idx_n)), &(p_coeff[0]), (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last)));
+            }
+
+            // ================================================================
+            // Write Output Tile to System Memory
+            // ================================================================
+            int output_row_bytes = config->out_dim1_pitch * current_output_rows;
+            dma_2dm(0,
+                    &(p_output1[0]),
+                    &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim1_pitch * config->output_rows)*(idx_h))],
+                    config->out_dim2_pitch,
+                    config->dst_dim2_pitch,
+                    output_row_bytes,
+                    current_n_size);
+
+            swap_buffers(&(p_output0), &(p_output1));
+            swap_buffers(&(p_input0), &(p_input1));
+        }
+    }
+    
+    // Wait for final output DMA to complete before returning
+    idma_hw_wait_all(0);
+
+    return XAI_ERR_OK;
+}
+
+// Non-VQ (per-tensor output scaling) cache version: no iDMA staging; convolves a padded copy of src via xaiConvolved3D.
+XAI_ERR_TYPE conv_exec_1x1j2d1_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config)
+{
+    // Returns XAI_ERR_DATASIZE if the scratch buffer from get_cache_padded_input() is
+    // missing or too small for this layer; otherwise the status of xaiConvolved3D().
+    // Source tile: describes the raw input exactly as passed in (all edges are 0).
+    xai_tile3D src_raw;
+    XAI_TILE3D_SET_BUFF_PTR(&src_raw, src);
+    XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&src_raw, src);
+    XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0);
+
+    // ========================================================================
+    // Get padded input buffer from allocator (SIMD-aligned dim1_pitch required)
+    // ========================================================================
+    int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2;
+    int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);
+    int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2;
+    int dim2_pitch = dim1_pitch * padded_dim2;
+    int input_buffer_size = dim2_pitch * config->src_dim3_size;
+
+    int8_t* padded_input = get_cache_padded_input();
+    // Reject layers whose padded input does not fit the scratch buffer (or if there is none).
+    if (!padded_input || input_buffer_size > (int)get_cache_padded_input_size()) {
+        return XAI_ERR_DATASIZE;
+    }
+    // Pre-fill the whole buffer, padding included, with the input zero point.
+    memset(padded_input, config->input_zero_point, input_buffer_size);
+
+    // ========================================================================
+    // Setup padded input tile descriptor
+    // ========================================================================
+    int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1;
+
+    xai_tile3D tile_input;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+
+    // Copy the raw input into the padded buffer; its edges are extended further below.
+    xaiCopyTile3D(&src_raw, &tile_input, true);
+    // NOTE(review): any status returned by xaiCopyTile3D/xaiExtendEdgesConst3D_I8 is not checked.
+    xai_size3D frame_size;
+    frame_size.dim1Size = config->dst_dim1_size * config->stride_x;
+    frame_size.dim2Size = config->dst_dim2_size * config->stride_y;
+    frame_size.dim3Size = config->src_dim3_size;
+
+    xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size);
+
+    // ========================================================================
+    // Pre-subsample input when numInCh > 64 (gather instruction workaround)
+    // The XAI 1x1j2d1 kernel uses gather instructions for numInCh > 64.
+    // Gather only works on local DRAM, not system memory (cache mode).
+    // Fix: apply stride manually, then call with stride=1 → dispatches to
+    // 1x1j1d1 kernel which has an aligned path that uses regular loads.
+    // ========================================================================
+    int need_prestride = (config->src_dim3_size > 2 * XCHAL_IVPN_SIMD_WIDTH)
+                         && (config->stride_x > 1);
+    if (need_prestride) {
+        int pre_outW = config->dst_dim1_size;
+        int pre_outH = config->dst_dim2_size;
+        int pre_d1_pitch = (pre_outW + 2*XCHAL_IVPN_SIMD_WIDTH - 1)
+                           & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);
+        int pre_d2_pitch = pre_d1_pitch * pre_outH;
+        int pre_buf_size = pre_d2_pitch * config->src_dim3_size;
+        int stride_w = config->stride_x;
+        int stride_h = config->stride_y;  // rows follow the vertical stride, not stride_x
+        // Place pre-subsampled data after padded input (128-byte aligned)
+        int pre_offset = (input_buffer_size + 127) & ~127;
+        int8_t* pre_input = &padded_input[pre_offset];
+
+        if ((pre_offset + pre_buf_size) > (int)get_cache_padded_input_size()) {
+            return XAI_ERR_DATASIZE;
+        }
+
+        memset(pre_input, config->input_zero_point, pre_buf_size);
+
+        // Subsample: keep every stride_w-th column and every stride_h-th row
+        int8_t* orig = &padded_input[data_offset];
+        for (int d = 0; d < config->src_dim3_size; d++) {
+            for (int oy = 0; oy < pre_outH; oy++) {
+                for (int ox = 0; ox < pre_outW; ox++) {
+                    pre_input[d * pre_d2_pitch + oy * pre_d1_pitch + ox] =
+                        orig[d * dim2_pitch + (stride_h * oy) * dim1_pitch + stride_w * ox];
+                }
+            }
+        }
+
+        // Update tile_input to point to pre-subsampled data
+        XAI_TILE3D_SET_BUFF_PTR(&tile_input, pre_input);
+        XAI_TILE3D_SET_BUFF_SIZE(&tile_input, pre_buf_size);
+        XAI_TILE3D_SET_DATA_PTR(&tile_input, pre_input);
+        XAI_TILE3D_SET_DIM1_PITCH(&tile_input, pre_d1_pitch);
+        XAI_TILE3D_SET_DIM2_PITCH(&tile_input, pre_d2_pitch);
+        XAI_TILE3D_SET_DIM1(&tile_input, pre_outW);
+        XAI_TILE3D_SET_DIM2(&tile_input, pre_outH);
+        XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, 0);
+        XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, 0);
+        XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, 0);
+        XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, 0);
+    }
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor
+    // ========================================================================
+    xai_tile4D tile_coeff;
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    xai_array tile_bias;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    xai_tile3D tile_output;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    // If pre-strided, override stride to 1 → dispatch to 1x1j1d1 (no gather)
+    if (need_prestride) {
+        XAI_CNN_CONV_SET_STRIDEX(&params, 1);
+        XAI_CNN_CONV_SET_STRIDEY(&params, 1);
+    } else {
+        XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+        XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+    }
+
+    // ========================================================================
+    // Execute convolution (non-VQ API)
+    // ========================================================================
+    XAI_ERR_TYPE status = xaiConvolved3D(&tile_input, &tile_coeff, &tile_bias,
+                                          &tile_output, &params);
+
+    // Writeback output from cache to system memory for DMA coherency
+    xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size);
+
+    return status;
+}
\ No newline at end of file
diff --git a/backends/cadence/vision/operators/conv/conv_exec_3x3j1d1.c b/backends/cadence/vision/operators/conv/conv_exec_3x3j1d1.c
new file mode 100644
index 00000000000..381e7ead66c
--- /dev/null
+++ b/backends/cadence/vision/operators/conv/conv_exec_3x3j1d1.c
@@ -0,0 +1,1030 @@
+#include "kernel_executors.h"
+#include "memory_manager.h"
+#include "dma.h"
+#include "utils.h"
+#include <xai_cnn_api.h>
+#include <string.h>
+#include <xtensa/hal.h>
+
+// VQ (per-channel output scaling) DMA version: tiles are staged through DRAM buffers with iDMA and ping-pong I/O buffers.
+XAI_ERR_TYPE conv_exec_3x3j1d1VQ(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    int8_t* outScale_ptr,
+    const conv_layer_config_t* config)
+{
+    // Returns XAI_ERR_OK on success, -1 if a DRAM buffer cannot be allocated, or the
+    // kernel's error status (both DMA channels are drained before that return).
+    // SECTION 1: DRAM Buffer Allocation
+    int dram0_used = 0;
+    int dram1_used = 0;
+
+    // FIX: Allocate coeff FIRST to avoid address-sensitive FOLD16 bug.
+    // See FUNCTIONALITY_FIXES.md §2 for details.
+    int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size,
+                                             config->coeff_dram,
+                                             &dram0_used, &dram1_used);
+    int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size,
+                                              config->input_ping_dram,
+                                              &dram0_used, &dram1_used);
+    int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size,
+                                              config->input_pong_dram,
+                                              &dram0_used, &dram1_used);
+    int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size,
+                                               config->output_ping_dram,
+                                               &dram0_used, &dram1_used);
+    int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size,
+                                               config->output_pong_dram,
+                                               &dram0_used, &dram1_used);
+    int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size,
+                                            config->bias_dram,
+                                            &dram0_used, &dram1_used);
+    int8_t* p_outscale = allocate_dram_buffer(config->outscale_buffer_size,
+                                                config->outscale_dram,
+                                                &dram0_used, &dram1_used);
+
+    if (!p_input0 || !p_input1 || !p_coeff ||
+        !p_output0 || !p_output1 || !p_bias || !p_outscale) {
+        return (-1);
+    }
+
+    // ========================================================================
+    // SECTION 2: Initialize XAI Tile Descriptors
+    // ========================================================================
+    xai_tile3D tile_input;
+    xai_size3D frame_size_input;
+    xai_tile4D tile_coeff;
+    xai_array tile_bias;
+    xai_array tile_outscale;
+    xai_tile3D tile_output;
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+
+    // Transfer constant data (all buffers are 64-byte aligned by test harness)
+    dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size);
+    dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size);
+    dma_1dm(0, outScale_ptr, p_outscale, config->outscale_buffer_size);
+
+    // Initialize input buffer and load first tile
+    _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size);
+
+    // First DMA: load IN_ROWS_FIRSTDMA rows at data offset
+    dma_3dm(1,
+            (void*)src,
+            (void*)&p_input0[config->in_data_offset],
+            config->src_dim1_pitch,
+            config->in_dim1_pitch,
+            config->src_dim2_pitch,
+            config->in_dim2_pitch,
+            config->src_dim1_size,
+            config->in_rows_firstdma,
+            config->src_dim3_size);
+
+    // Wait for all initial DMA transfers to complete
+    idma_hw_wait_all(0);  // coeff + bias + outscale on ch0
+    idma_hw_wait_all(1);  // input on ch1
+
+    // ========================================================================
+    // Configure Input Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2(&tile_input, (config->in_rows_firstdma - config->in_dim2_edge1));
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2);
+
+    // Frame size for edge extension
+    frame_size_input.dim1Size = config->src_dim1_size;
+    frame_size_input.dim2Size = config->src_dim2_size;
+    frame_size_input.dim3Size = config->src_dim3_size;
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)
+    // ========================================================================
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size);
+
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size);
+
+    // ========================================================================
+    // Configure Output Scale Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, p_outscale);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->outscale_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_outscale, p_outscale);
+    XAI_ARRAY_SET_WIDTH(&tile_outscale, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1);
+    XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16);
+    XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->n_tile_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles)
+    // ========================================================================
+    int last_tile = 1;
+
+    for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) {
+        int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1);
+        int current_n_size = (idx_n < config->n_tiles - 1) ?
+                             config->n_tile_size : config->n_tile_size_last;
+
+        // Update coefficient/bias/outscale for N-tile (matching convIdma.c line 649)
+        XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n);
+        XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size);
+
+        XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size);
+
+        XAI_ARRAY_SET_DATA_PTR(&tile_outscale, &p_outscale[config->n_tile_size * 2 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_outscale, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_outscale, current_n_size);
+
+        XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n);
+        XAI_TILE3D_SET_DIM3(&tile_output, current_n_size);
+
+        // Process vertical tiles (matching convIdma.c lines 664-728)
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1);
+
+            // Calculate actual rows for this height tile (handle last tile edge case)
+            int current_output_rows = (idx_h < config->height_tiles - 1) ?
+                                      config->output_rows :
+                                      (config->dst_dim2_size - (config->output_rows * idx_h));
+
+            // ================================================================
+            // Prefetch Next Input Tile (Ping-Pong Buffering)
+            // ================================================================
+            if (!last_h_tile) {
+                int temp_idx_h;
+                inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1);
+                _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size);
+                // Dst offset is clamped to >= 1: presumably in_dim1_edge1 for this 3x3 kernel (row 0, first data column) - TODO confirm.
+                dma_3dm(/* ch */             1,
+                        /* src */            (void*)&(src[max(((config->stride_y * config->output_rows * temp_idx_h - config->in_dim2_edge1) * config->src_dim1_size),0)]),
+                        /* dst */            (void*)&(p_input1[max((((-(config->output_rows)* config->in_dim1_pitch))*(temp_idx_h))+(config->in_data_offset),1)]),
+                        /* src_row_pitch */  config->src_dim1_pitch,
+                        /* dst_row_pitch */  config->in_dim1_pitch,
+                        /* src_tile_pitch */ config->src_dim2_pitch,
+                        /* dst_tile_pitch */ config->in_dim2_pitch,
+                        /* row_sz */         config->src_dim1_size,
+                        /* nrows */          min(((config->stride_y * config->output_rows)*(temp_idx_h))+(config->input_rows-config->in_dim2_edge1),min((-(config->stride_y * config->output_rows))*(temp_idx_h)+(config->src_dim2_size + config->in_dim2_edge2),config->input_rows)),
+                        /* ntiles */         config->src_dim3_size);
+            }
+            // NOTE(review): no idma_hw_wait_all() is visible inside this loop; confirm the dma_*() helpers serialize against earlier transfers.
+            // ================================================================
+            // Update Tile Descriptors for Current Iteration (matching convIdma.c lines 694-700)
+            // ================================================================
+            XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+            XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+            XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h));
+            // Note: DIM2 stays constant at (in_rows_firstdma - edge1) - set once above
+            XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1);
+            XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows);
+
+            // ================================================================
+            // Perform Edge Extension and Convolution
+            // ================================================================
+            xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size_input);
+            XAI_ERR_TYPE status = xaiConvolvedVQ3D_S_3x3j1d1_S8S8IX_MOW_WHD(
+                &(tile_input), &(tile_coeff), &(tile_bias),
+                &(tile_outscale), &(tile_output), &(params));
+
+            if (status != XAI_ERR_OK) {
+                // Drain both channels first: the ch1 prefetch or an earlier ch0 transfer may still be running on these buffers.
+                idma_hw_wait_all(0);
+                idma_hw_wait_all(1);
+                return status;
+            }
+            // ================================================================
+            // Prefetch next coefficient tile (if needed)
+            // ================================================================
+            if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) {
+                int temp_idx_n;
+                int temp_idx_h;
+
+                inc_iter_to_temp(&(temp_idx_n),idx_n, config->n_tiles , inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1));
+
+                dma_1dm(0,/* src */ (((coeff_ptr) + ((config->coeff_buffer_size)*(temp_idx_n)))), /* dst */ &(p_coeff[0]), /* row size */ (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last)));
+            }
+
+            // ================================================================
+            // Write Output Tile to System Memory (matching convIdma.c lines 718-724)
+            // ================================================================
+            // Calculate actual output bytes for this height tile
+            // Last height tile may have fewer rows than output_rows
+            int output_row_bytes = config->out_dim1_pitch * current_output_rows;
+
+            dma_2dm(0,
+                    /* src */           &(p_output1[0]),
+                    /* dst */           &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim1_pitch * config->output_rows)*(idx_h))],
+                    /* src stride 2d */ config->out_dim2_pitch,
+                    /* dst stride 2d */ config->dst_dim2_pitch,
+                    /* row size */      output_row_bytes,
+                    /* count 2d */      current_n_size);
+
+            swap_buffers(&(p_output0), &(p_output1));
+            swap_buffers(&(p_input0), &(p_input1));
+        }
+    }
+
+    // Wait for final output DMA to complete before returning
+    idma_hw_wait_all(0);
+
+    return XAI_ERR_OK;
+}
+
+// VQ (per-channel output scaling) cache version of the 3x3 convolution: all data stays in system memory and is accessed through the processor cache.
+// Returns XAI_ERR_DATASIZE when the padded input exceeds the shared buffer, otherwise the status reported by xaiConvolvedVQ3D.
+XAI_ERR_TYPE conv_exec_3x3j1d1VQ_cache(
+    int8_t* src,           // raw (unpadded) input; rows are src_dim1_pitch apart, planes src_dim2_pitch apart
+    int8_t* dst,           // output tensor; receives the convolution result and is written back from the dcache
+    int8_t* coeff_ptr,     // S8 coefficients, described below as a WHDN tile with dst_dim3_size entries in dim 4
+    int8_t* bias_ptr,      // bias values, described below as dst_dim3_size S32 entries
+    int8_t* outScale_ptr,  // per-channel output scales, described below as dst_dim3_size U16 entries
+    const conv_layer_config_t* config)  // layer geometry, edge sizes and quantization parameters
+{
+    // ========================================================================
+    // Setup source raw tile descriptor (raw input without padding). NOTE(review): only populated here; no call below reads it.
+    // ========================================================================
+    xai_tile3D src_raw;
+    XAI_TILE3D_SET_BUFF_PTR(&src_raw, src);
+    XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&src_raw, src);
+    XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0);
+
+    // ========================================================================
+    // Get padded input buffer from allocator (shared across cache kernels) and derive the padded geometry
+    // ========================================================================
+    int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2;  // row length including left/right edges
+    int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);  // round up to a multiple of 2*XCHAL_IVPN_SIMD_WIDTH (mask form assumes a power of two)
+    int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2;  // row count including top/bottom edges
+    int dim2_pitch = dim1_pitch * padded_dim2;  // bytes per padded plane
+    int input_buffer_size = dim2_pitch * config->src_dim3_size;  // bytes for all src_dim3_size planes
+
+    // Get shared padded input buffer from allocator
+    int8_t* padded_input = get_cache_padded_input();
+
+    if (input_buffer_size > (int)get_cache_padded_input_size()) {
+        return XAI_ERR_DATASIZE;
+    }
+
+    // Pre-fill the whole padded buffer with the input zero point (the value used for padding), not with literal zeros
+    memset(padded_input, config->input_zero_point, input_buffer_size);
+
+    // ========================================================================
+    // Setup padded input tile descriptor
+    // ========================================================================
+    int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1;  // skip the top edge rows and the left edge columns
+
+    xai_tile3D tile_input;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+
+    // ========================================================================
+    // Copy raw input to padded buffer and extend edges
+    // ========================================================================
+#ifdef USE_DMA_FOR_CACHE_COPY
+    // Use DMA 3D transfer to copy input data into padded buffer at data_offset. NOTE(review): padded_input was just written through the cache by memset - confirm cache/DMA coherency for this buffer.
+    dma_3dm(0,
+            /* src */           src,
+            /* dst */           &padded_input[data_offset],
+            /* src_row_pitch */ config->src_dim1_pitch,
+            /* dst_row_pitch */ dim1_pitch,
+            /* src_tile_pitch */ config->src_dim2_pitch,
+            /* dst_tile_pitch */ dim2_pitch,
+            /* row_sz */        config->src_dim1_size,
+            /* nrows */         config->src_dim2_size,
+            /* ntiles */        config->src_dim3_size);
+    idma_hw_wait_all(0);  // the copy must have landed before edge extension and the convolution read padded_input
+#else
+    // Row-by-row memcpy rather than a library tile copy (no DMA required): avoids SIMD overread near the source buffer boundary
+    for (int d = 0; d < config->src_dim3_size; d++) {
+        for (int h = 0; h < config->src_dim2_size; h++) {
+            memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch],
+                   &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch],
+                   config->src_dim1_size);
+        }
+    }
+    (void)src_raw;
+#endif
+    // Frame extent handed to the edge-extension call: output size scaled by the stride in each spatial dimension
+    xai_size3D frame_size;
+    frame_size.dim1Size = config->dst_dim1_size * config->stride_x;
+    frame_size.dim2Size = config->dst_dim2_size * config->stride_y;
+    frame_size.dim3Size = config->src_dim3_size;
+
+    xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size);
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N), used in place from coeff_ptr
+    // ========================================================================
+    xai_tile4D tile_coeff;
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Bias Array (dst_dim3_size S32 entries, 4 bytes each)
+    // ========================================================================
+    xai_array tile_bias;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Scale Array (dst_dim3_size U16 entries, 2 bytes each)
+    // ========================================================================
+    xai_array tile_outscale;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, outScale_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->dst_dim3_size * 2);
+    XAI_ARRAY_SET_DATA_PTR(&tile_outscale, outScale_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_outscale, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1);
+    XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16);
+    XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor (points to system memory)
+    // ========================================================================
+    xai_tile3D tile_output;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters (struct is zeroed first, then each field is taken from config)
+    // ========================================================================
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // Execute convolution using generic system-memory API
+    // This version accesses data through the processor cache
+    // ========================================================================
+    XAI_ERR_TYPE status = xaiConvolvedVQ3D(&tile_input, &tile_coeff, &tile_bias,
+                                            &tile_outscale, &tile_output, &params);
+
+    // Writeback output from cache to system memory for DMA coherency (performed whatever the kernel status was)
+    xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size);
+
+    return status;
+}
+
+// ============================================================================
+// Non-VQ (per-tensor output scaling) versions
+// ============================================================================
+
+// Non-VQ (per-tensor output scaling) DMA version: 3x3 convolution tiled over output channels (N) and output rows (H), staged through buffers from allocate_dram_buffer.
+// Returns -1 if a buffer allocation fails, the kernel's status if a tile fails (after draining DMA), and XAI_ERR_OK otherwise.
+XAI_ERR_TYPE conv_exec_3x3j1d1(
+    int8_t* src,        // input tensor (DMA source); rows are src_dim1_pitch apart, planes src_dim2_pitch apart
+    int8_t* dst,        // output tensor (DMA destination); output channels are dst_dim2_pitch apart
+    int8_t* coeff_ptr,  // coefficients; one chunk of up to coeff_buffer_size bytes is DMA'd per N-tile
+    int8_t* bias_ptr,   // bias data; bias_buffer_size bytes are DMA'd once up front
+    const conv_layer_config_t* config)  // layer geometry, tiling, buffer placement and quantization parameters
+{
+    // ========================================================================
+    // SECTION 1: DRAM Buffer Allocation (no outscale buffer needed)
+    // ========================================================================
+    int dram0_used = 0;
+    int dram1_used = 0;
+
+    // FIX: Allocate coeff FIRST to avoid address-sensitive FOLD16 bug.
+    // See FUNCTIONALITY_FIXES.md §2 for details.
+    int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size,
+                                             config->coeff_dram,
+                                             &dram0_used, &dram1_used);
+    int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size,
+                                              config->input_ping_dram,
+                                              &dram0_used, &dram1_used);
+    int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size,
+                                              config->input_pong_dram,
+                                              &dram0_used, &dram1_used);
+    int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size,
+                                               config->output_ping_dram,
+                                               &dram0_used, &dram1_used);
+    int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size,
+                                               config->output_pong_dram,
+                                               &dram0_used, &dram1_used);
+    int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size,
+                                            config->bias_dram,
+                                            &dram0_used, &dram1_used);
+
+    if (!p_input0 || !p_input1 || !p_coeff ||
+        !p_output0 || !p_output1 || !p_bias) {
+        return (-1);  // no DMA has been issued yet, so nothing needs draining here
+    }
+
+    // ========================================================================
+    // SECTION 2: Initialize XAI Tile Descriptors
+    // ========================================================================
+    xai_tile3D tile_input;
+    xai_size3D frame_size_input;
+    xai_tile4D tile_coeff;
+    xai_array tile_bias;
+    xai_tile3D tile_output;
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    /* Initialize DMA engines: 3D transfers are issued on channel 1, 1D/2D transfers on channel 0 */
+    dma_3dm_init(1);
+    dma_2dm_init(0);
+
+    // Transfer constant data (no outscale): first coefficient chunk and the complete bias buffer
+    dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size);
+    dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size);
+
+    // Initialize input buffer with the zero point (padding value) and load first tile
+    _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size);
+    // First DMA: load IN_ROWS_FIRSTDMA rows at data offset
+    dma_3dm(1,
+            /* src */            (void*)src,
+            /* dst */            (void*)&p_input0[config->in_data_offset],
+            /* src_row_pitch */  config->src_dim1_pitch,
+            /* dst_row_pitch */  config->in_dim1_pitch,
+            /* src_tile_pitch */ config->src_dim2_pitch,
+            /* dst_tile_pitch */ config->in_dim2_pitch,
+            /* row_sz */         config->src_dim1_size,
+            /* nrows */          config->in_rows_firstdma,
+            /* ntiles */         config->src_dim3_size);
+
+    // Wait for all initial DMA transfers to complete
+    idma_hw_wait_all(0);  // coeff + bias on ch0
+    idma_hw_wait_all(1);  // input on ch1
+
+    // ========================================================================
+    // Configure Input Tile Descriptor (buffer/data pointers and DIM2_COORD are refreshed per tile in the loop)
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2(&tile_input, (config->in_rows_firstdma - config->in_dim2_edge1));
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2);
+
+    // Frame size for edge extension: the full source tensor extent
+    frame_size_input.dim1Size = config->src_dim1_size;
+    frame_size_input.dim2Size = config->src_dim2_size;
+    frame_size_input.dim3Size = config->src_dim3_size;
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N); DIM4 and DIM4_COORD are refreshed per N-tile
+    // ========================================================================
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size);
+
+    // ========================================================================
+    // Configure Bias Array (S32 entries); data pointer, width and capacity are refreshed per N-tile
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor (buffer/data pointers, DIM2 and coordinates are set per tile in the loop)
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters (params was zeroed above; each field is taken from config)
+    // ========================================================================
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles)
+    // ========================================================================
+    int last_tile = 1;  // constant 1 here, so the final (n, h) tile is always treated as the end of the work
+
+    for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) {
+        int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1);
+        int current_n_size = (idx_n < config->n_tiles - 1) ?
+                             config->n_tile_size : config->n_tile_size_last;
+
+        // Update coefficient/bias for N-tile (no outscale)
+        XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n);
+        XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size);
+
+        XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]);  // 4 bytes per S32 bias entry
+        XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size);
+
+        XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n);
+        XAI_TILE3D_SET_DIM3(&tile_output, current_n_size);
+
+        // Process vertical tiles
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1);
+
+            // Calculate actual rows for this height tile (the last tile takes whatever rows remain)
+            int current_output_rows = (idx_h < config->height_tiles - 1) ?
+                                      config->output_rows :
+                                      (config->dst_dim2_size - (config->output_rows * idx_h));
+
+            // ================================================================
+            // Prefetch Next Input Tile (Ping-Pong Buffering). NOTE(review): no explicit ch1 wait is visible before the next iteration reads this buffer - presumably the dma_* helpers serialize per channel; confirm.
+            // ================================================================
+            if (!last_h_tile) {
+                int temp_idx_h;
+                inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1);  // presumably the next H index with wrap-around - confirm against inc_iter_to_temp
+                _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size);
+                dma_3dm(1,
+                        /* src: row offset uses src_dim1_pitch so it matches the src_row_pitch passed below; clamped to the top of the frame */ (void*)&(src[max(((config->stride_y * config->output_rows * temp_idx_h - config->in_dim2_edge1) * config->src_dim1_pitch),0)]),
+                        /* dst: NOTE(review) the floor of 1 looks like the 1-column left edge of a 3x3 kernel - confirm */ (void*)&(p_input1[max((((-(config->output_rows)* config->in_dim1_pitch))*(temp_idx_h))+(config->in_data_offset),1)]),
+                        /* src_row_pitch */  config->src_dim1_pitch,
+                        /* dst_row_pitch */  config->in_dim1_pitch,
+                        /* src_tile_pitch */ config->src_dim2_pitch,
+                        /* dst_tile_pitch */ config->in_dim2_pitch,
+                        /* row_sz */         config->src_dim1_size,
+                        /* nrows: at most input_rows, fewer where the tile reaches the top or bottom of the frame */ min(((config->stride_y * config->output_rows)*(temp_idx_h))+(config->input_rows-config->in_dim2_edge1),min((-(config->stride_y * config->output_rows))*(temp_idx_h)+(config->src_dim2_size + config->in_dim2_edge2),config->input_rows)),
+                        /* ntiles */         config->src_dim3_size);
+            }
+
+            // ================================================================
+            // Update Tile Descriptors for Current Iteration (input from p_input0, output into p_output1)
+            // ================================================================
+            XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+            XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+            XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1);
+            XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows);
+
+            // ================================================================
+            // Perform Edge Extension and Convolution (non-VQ API)
+            // ================================================================
+            xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size_input);
+
+            XAI_ERR_TYPE status = xaiConvolved3D_S_3x3j1d1_S8S8IX_MOW_WHD(
+                                        &(tile_input),
+                                        &(tile_coeff),
+                                        &(tile_bias),
+                                        &(tile_output),
+                                        &(params));
+
+            if (status != XAI_ERR_OK) {
+                idma_hw_wait_all(0);  // drain any output/coefficient transfer still pending on ch0
+                idma_hw_wait_all(1);  // drain the input prefetch issued above on ch1, so no DMA outlives this call
+                return status;
+            }
+
+            // ================================================================
+            // Prefetch next coefficient tile (if needed): only after the last H-tile of a non-final N-tile
+            // ================================================================
+            if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) {
+                int temp_idx_n;
+                int temp_idx_h;
+                inc_iter_to_temp(&(temp_idx_n),idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1));  // presumably the inner call's return value carries the H wrap into the N index - confirm
+                dma_1dm(0, /* src */ (coeff_ptr + (config->coeff_buffer_size * temp_idx_n)), /* dst */ &(p_coeff[0]), /* row size: full chunk, or just the last N-tile's coefficients */ (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last)));
+            }
+
+            // ================================================================
+            // Write Output Tile to System Memory. NOTE(review): each channel's rows are copied as one contiguous run, which assumes out_dim1_pitch equals the destination row pitch - confirm.
+            // ================================================================
+            int output_row_bytes = config->out_dim1_pitch * current_output_rows;  // the last height tile may have fewer rows than output_rows
+            dma_2dm(0,
+                    /* src */           &(p_output1[0]),
+                    /* dst */           &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim1_pitch * config->output_rows)*(idx_h))],
+                    /* src stride 2d */ config->out_dim2_pitch,
+                    /* dst stride 2d */ config->dst_dim2_pitch,
+                    /* row size */      output_row_bytes,
+                    /* count 2d */      current_n_size);
+
+            swap_buffers(&(p_output0), &(p_output1));
+            swap_buffers(&(p_input0), &(p_input1));
+        }
+    }
+
+    // Wait for final output DMA to complete before returning
+    idma_hw_wait_all(0);
+
+    return XAI_ERR_OK;
+}
+
+// Non-VQ cache version - per-tensor output scaling
+XAI_ERR_TYPE conv_exec_3x3j1d1_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // Setup source raw tile descriptor
+    // ========================================================================
+    xai_tile3D src_raw;
+    XAI_TILE3D_SET_BUFF_PTR(&src_raw, src);
+    XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&src_raw, src);
+    XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0);
+
+    // ========================================================================
+    // Get padded input buffer from allocator
+    // ========================================================================
+    int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2;
+    int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);
+    int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2;
+    int dim2_pitch = dim1_pitch * padded_dim2;
+    int input_buffer_size = dim2_pitch * config->src_dim3_size;
+    
+    int8_t* padded_input = get_cache_padded_input();
+    
+    if (input_buffer_size > (int)get_cache_padded_input_size()) {
+        return XAI_ERR_DATASIZE;
+    }
+    
+    memset(padded_input, config->input_zero_point, input_buffer_size);
+
+    // ========================================================================
+    // Setup padded input tile descriptor
+    // ========================================================================
+    int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1;
+    
+    xai_tile3D tile_input;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+
+    // Copy raw input to padded buffer
+    // Safe manual copy: avoids SIMD overread near source buffer boundary
+    for (int d = 0; d < config->src_dim3_size; d++) {
+        for (int h = 0; h < config->src_dim2_size; h++) {
+            memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch],
+                   &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch],
+                   config->src_dim1_size);
+        }
+    }
+    (void)src_raw;
+    
+    xai_size3D frame_size;
+    frame_size.dim1Size = config->dst_dim1_size * config->stride_x;
+    frame_size.dim2Size = config->dst_dim2_size * config->stride_y;
+    frame_size.dim3Size = config->src_dim3_size;
+    
+    xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size);
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor
+    // ========================================================================
+    xai_tile4D tile_coeff;
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr);    
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);   
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    xai_array tile_bias;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr);   
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    xai_tile3D tile_output;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // Execute convolution (non-VQ API)
+    // ========================================================================
+    XAI_ERR_TYPE status = xaiConvolved3D(&tile_input, &tile_coeff, &tile_bias, 
+                                          &tile_output, &params);
+
+    // Writeback output from cache to system memory for DMA coherency
+    xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size);
+
+    return status;
+}
\ No newline at end of file
diff --git a/backends/cadence/vision/operators/conv/conv_exec_3x3j2d1.c b/backends/cadence/vision/operators/conv/conv_exec_3x3j2d1.c
new file mode 100644
index 00000000000..62f1fcb18a4
--- /dev/null
+++ b/backends/cadence/vision/operators/conv/conv_exec_3x3j2d1.c
@@ -0,0 +1,1028 @@
+#include "kernel_executors.h"
+#include "memory_manager.h"
+#include "dma.h"
+#include "utils.h"
+#include <xai_cnn_api.h>
+#include <string.h>
+#include <xtensa/hal.h>
+
+/**
+ * VQ (per-channel output scaling) convolution, DMA version. Operands are staged
+ * in buffers obtained from allocate_dram_buffer() and processed tile by tile
+ * (N-tiles x height-tiles) by xaiConvolvedVQ3D_S_3x3j2d1_S8S8IX_MOW_WHD.
+ *
+ * @param src          Input feature map (int8); source of the input DMA loads.
+ * @param dst          Output feature map (int8); target of the output DMA.
+ * @param coeff_ptr    Coefficients (int8), copied into the local coeff buffer.
+ * @param bias_ptr     Bias values, described to the kernel as XAI_ARRAY_S32.
+ * @param outScale_ptr Per-channel scales, described as XAI_ARRAY_U16.
+ * @param config       Layer geometry, tiling, buffer and quantization settings.
+ * @return -1 if any allocate_dram_buffer() call returns NULL; otherwise the
+ *         first non-OK status from the convolution kernel, or XAI_ERR_OK.
+ */
+XAI_ERR_TYPE conv_exec_3x3j2d1VQ(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    int8_t* outScale_ptr,
+    const conv_layer_config_t* config)
+{
+    // SECTION 1: DRAM buffer allocation
+    int dram0_used = 0;
+    int dram1_used = 0;
+
+    // FIX: Allocate coeff FIRST to avoid address-sensitive FOLD16 bug.
+    // See FUNCTIONALITY_FIXES.md §2 for details.
+    int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size,
+                                             config->coeff_dram,
+                                             &dram0_used, &dram1_used);
+    int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size,
+                                              config->input_ping_dram,
+                                              &dram0_used, &dram1_used);
+    int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size,
+                                              config->input_pong_dram,
+                                              &dram0_used, &dram1_used);
+    int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size,
+                                               config->output_ping_dram,
+                                               &dram0_used, &dram1_used);
+    int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size,
+                                               config->output_pong_dram,
+                                               &dram0_used, &dram1_used);
+    int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size,
+                                            config->bias_dram,
+                                            &dram0_used, &dram1_used);
+    int8_t* p_outscale = allocate_dram_buffer(config->outscale_buffer_size,
+                                                config->outscale_dram,
+                                                &dram0_used, &dram1_used);
+
+    if (!p_input0 || !p_input1 || !p_coeff ||
+        !p_output0 || !p_output1 || !p_bias || !p_outscale) {
+        return (-1);
+    }
+
+    // SECTION 2: Initialize XAI tile descriptors
+    xai_tile3D tile_input;
+    xai_size3D frame_size_input;
+    xai_tile4D tile_coeff;
+    xai_array tile_bias;
+    xai_array tile_outscale;
+    xai_tile3D tile_output;
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+
+    /* Initialize DMA engines */
+    dma_3dm_init(1);
+    dma_2dm_init(0);
+
+    // Transfer constant data (all buffers are 64-byte aligned by test harness)
+    dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size);
+    dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size);
+    dma_1dm(0, outScale_ptr, p_outscale, config->outscale_buffer_size);
+
+    // Initialize input buffer and load first tile
+    _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size);
+
+    // First DMA: load IN_ROWS_FIRSTDMA rows at data offset
+    dma_3dm(1,
+            (void*)src,
+            (void*)&p_input0[config->in_data_offset],
+            config->src_dim1_pitch,
+            config->in_dim1_pitch,
+            config->src_dim2_pitch,
+            config->in_dim2_pitch,
+            config->src_dim1_size,
+            config->in_rows_firstdma,
+            config->src_dim3_size);
+
+    // Wait for all initial DMA transfers to complete
+    idma_hw_wait_all(0);  // coeff + bias + outscale on ch0
+    idma_hw_wait_all(1);  // input on ch1
+
+    // Configure input tile descriptor
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2(&tile_input, (config->in_rows_firstdma - config->in_dim2_edge1));
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2);
+
+    // Frame size for edge extension
+    frame_size_input.dim1Size = config->src_dim1_size;
+    frame_size_input.dim2Size = config->src_dim2_size;
+    frame_size_input.dim3Size = config->src_dim3_size;
+
+    // Configure coefficient tile descriptor (4D: W×H×C×N)
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size);
+
+    // Configure bias array
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size);
+
+    // Configure output scale array
+    XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, p_outscale);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->outscale_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_outscale, p_outscale);
+    XAI_ARRAY_SET_WIDTH(&tile_outscale, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1);
+    XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16);
+    XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->n_tile_size);
+
+    // Configure output tile descriptor (buffer/data pointers are set per tile)
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // Configure convolution parameters
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // SECTION 3: Tiled execution loop (N-tiles × H-tiles)
+    int last_tile = 1;
+    // First non-OK kernel status; reported to the caller after the loop.
+    XAI_ERR_TYPE result = XAI_ERR_OK;
+
+    for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) {
+        int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1);
+        int current_n_size = (idx_n < config->n_tiles - 1) ?
+                             config->n_tile_size : config->n_tile_size_last;
+
+        // Update coefficient/bias/outscale for N-tile
+        XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n);
+        XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size);
+
+        // Bias entries are XAI_ARRAY_S32, hence the 4-byte step per channel.
+        XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size);
+
+        // Output scales are XAI_ARRAY_U16, hence the 2-byte step per channel.
+        XAI_ARRAY_SET_DATA_PTR(&tile_outscale, &p_outscale[config->n_tile_size * 2 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_outscale, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_outscale, current_n_size);
+
+        XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n);
+        XAI_TILE3D_SET_DIM3(&tile_output, current_n_size);
+
+        // Process vertical tiles
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1);
+
+            // Calculate actual rows for this height tile (handle last tile edge case)
+            int current_output_rows = (idx_h < config->height_tiles - 1) ?
+                                      config->output_rows :
+                                      (config->dst_dim2_size - (config->output_rows * idx_h));
+            int current_row_size = config->dst_dim1_size * current_output_rows;
+
+            // Prefetch next input tile into the spare input buffer (ping-pong)
+            if (!last_h_tile) {
+                int temp_idx_h;
+                // inc_iter_to_temp() is defined elsewhere; presumably it stores the
+                // next idx_h (wrapping at height_tiles) in temp_idx_h - TODO confirm.
+                inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1);
+                _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size);
+
+                // NOTE(review): the source offset scales rows by src_dim1_size (not
+                // src_dim1_pitch) and the destination offset is clamped to 1; confirm.
+                dma_3dm(  /* ch */              1,
+                          /* src */             (void*)&(src[max((( config->stride_y * config->output_rows * temp_idx_h - config->in_dim2_edge1) * config->src_dim1_size),0)]),
+                          /* dst */             (void*)&(p_input1[max((((-(config->output_rows)* config->in_dim1_pitch))*(temp_idx_h))+(config->in_data_offset),1)]),
+                          /* src_row_pitch */   config->src_dim1_pitch,
+                          /* dst_row_pitch */   config->in_dim1_pitch,
+                          /* src_tile_pitch */  config->src_dim2_pitch,
+                          /* dst_tile_pitch */  config->in_dim2_pitch,
+                          /* row_sz */          config->src_dim1_size,
+                          /* nrows */           min(((config->stride_y * config->output_rows )*(temp_idx_h))+(config->input_rows-config->in_dim2_edge1),min((((-(config->stride_y * config->output_rows )))*(temp_idx_h))+(config->src_dim2_size + config->in_dim2_edge2),config->input_rows)),
+                          /* ntiles */          config->src_dim3_size);
+            }
+
+            // Update tile descriptors for the current iteration
+            XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+            XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+            XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1);
+            XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows);
+
+            // Perform edge extension and convolution
+            xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size_input);
+
+            XAI_ERR_TYPE status = xaiConvolvedVQ3D_S_3x3j2d1_S8S8IX_MOW_WHD(&(tile_input),
+                    &(tile_coeff),
+                    &(tile_bias), &(tile_outscale),
+                    &(tile_output), &(params));
+            // Remember the first kernel failure. The loop is not cut short, so the
+            // order of DMA calls stays exactly as on the success path.
+            if (status != XAI_ERR_OK && result == XAI_ERR_OK) {
+                result = status;
+            }
+
+            // Prefetch next coefficient tile (last height tile of a non-final N-tile)
+            if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) {
+                int temp_idx_n;
+                int temp_idx_h;
+
+                // NOTE(review): the inner call's return value is the step applied to
+                // idx_n, presumably a carry when idx_h wraps - confirm its definition.
+                inc_iter_to_temp(&(temp_idx_n),idx_n, config->n_tiles , inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1));
+
+                dma_1dm(0,/* src */ (((coeff_ptr) + ((config->coeff_buffer_size)*(temp_idx_n)))), /* dst */ &(p_coeff[0]), /* row size */ (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last)));
+            }
+
+            // Write output tile to system memory
+            dma_2dm(0,
+                    /* src */               &(p_output1[0]),
+                    /* dst */               &dst[((config->dst_dim2_pitch *config->n_tile_size )*(idx_n))+((config->out_dim2_pitch)*(idx_h))],
+                    /* src stride 2d */     config->out_dim2_pitch,
+                    /* dst stride 2d */     config->dst_dim2_pitch,
+                    /* row size */          current_row_size,
+                    /* count 2d */          current_n_size);
+
+            swap_buffers(&(p_output0), &(p_output1));
+            swap_buffers(&(p_input0), &(p_input1));
+        }
+    }
+
+    // Wait for final output DMA to complete before returning
+    idma_hw_wait_all(0);
+
+    return result;
+}
+
+// VQ (per-channel output scaling) cache version
+// All data stays in system memory and is accessed through processor cache
+XAI_ERR_TYPE conv_exec_3x3j2d1VQ_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    int8_t* outScale_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // Setup source raw tile descriptor (points to raw input without padding)
+    // ========================================================================
+    xai_tile3D src_raw;
+    XAI_TILE3D_SET_BUFF_PTR(&src_raw, src);
+    XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&src_raw, src);
+    XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0);
+
+    // ========================================================================
+    // Get padded input buffer from allocator (shared across cache kernels)
+    // ========================================================================
+    int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2;
+    int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);
+    int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2;
+    int dim2_pitch = dim1_pitch * padded_dim2;
+    int input_buffer_size = dim2_pitch * config->src_dim3_size;
+    
+    // Get shared padded input buffer from allocator
+    int8_t* padded_input = get_cache_padded_input();
+    
+    if (input_buffer_size > (int)get_cache_padded_input_size()) {
+        return XAI_ERR_DATASIZE;
+    }
+    
+    // Zero-fill the padded buffer
+    memset(padded_input, config->input_zero_point, input_buffer_size);
+
+    // ========================================================================
+    // Setup padded input tile descriptor
+    // ========================================================================
+    int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1;
+    
+    xai_tile3D tile_input;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+
+    // ========================================================================
+    // Copy raw input to padded buffer and extend edges
+    // ========================================================================
+#ifdef USE_DMA_FOR_CACHE_COPY
+    // Use DMA 3D transfer to copy input data into padded buffer at data_offset
+    dma_3dm(0,
+            /* src */           src,
+            /* dst */           &padded_input[data_offset],
+            /* src_row_pitch */ config->src_dim1_pitch,
+            /* dst_row_pitch */ dim1_pitch,
+            /* src_tile_pitch */ config->src_dim2_pitch,
+            /* dst_tile_pitch */ dim2_pitch,
+            /* row_sz */        config->src_dim1_size,
+            /* nrows */         config->src_dim2_size,
+            /* ntiles */        config->src_dim3_size);
+#else
+    // Use library tile copy function (no DMA required)
+    // Safe manual copy: avoids SIMD overread near source buffer boundary
+    for (int d = 0; d < config->src_dim3_size; d++) {
+        for (int h = 0; h < config->src_dim2_size; h++) {
+            memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch],
+                   &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch],
+                   config->src_dim1_size);
+        }
+    }
+    (void)src_raw;
+#endif
+    
+    xai_size3D frame_size;
+    frame_size.dim1Size = config->dst_dim1_size * config->stride_x;
+    frame_size.dim2Size = config->dst_dim2_size * config->stride_y;
+    frame_size.dim3Size = config->src_dim3_size;
+    
+    xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size);
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)  
+    // ========================================================================
+    xai_tile4D tile_coeff;
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr);    
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);   
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Bias Array 
+    // ========================================================================
+    xai_array tile_bias;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr);   
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Scale Array
+    // ========================================================================
+    xai_array tile_outscale;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, outScale_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->dst_dim3_size * 2);
+    XAI_ARRAY_SET_DATA_PTR(&tile_outscale, outScale_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_outscale, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1);
+    XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16);
+    XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor (points to system memory)
+    // ========================================================================
+    xai_tile3D tile_output;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // Execute convolution using generic system-memory API
+    // This version accesses data through the processor cache
+    // ========================================================================
+    XAI_ERR_TYPE status = xaiConvolvedVQ3D(&tile_input, &tile_coeff, &tile_bias, 
+                                            &tile_outscale, &tile_output, &params);
+
+    // Writeback output from cache to system memory for DMA coherency
+    xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size);
+
+    return status;
+}
+
+// ============================================================================
+// Non-VQ (per-tensor output scaling) versions
+// ============================================================================
+
+// Non-VQ DMA version - per-tensor output scaling.
+// Runs the 3x3j2d1 kernel tile by tile (N-tiles x H-tiles) on local DRAM buffers
+// filled via iDMA, ping-ponging the input and output buffers. src, dst,
+// coeff_ptr and bias_ptr are the full tensors; config supplies the tiling.
+// Returns XAI_ERR_OK, -1 if a DRAM buffer cannot be allocated, or the kernel's error.
+XAI_ERR_TYPE conv_exec_3x3j2d1(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // SECTION 1: DRAM Buffer Allocation (no outscale buffer needed)
+    // ========================================================================
+    int dram0_used = 0;
+    int dram1_used = 0;
+    // FIX: Allocate coeff FIRST to avoid address-sensitive FOLD16 bug.
+    // See FUNCTIONALITY_FIXES.md §2 for details.
+    int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size,
+                                             config->coeff_dram,
+                                             &dram0_used, &dram1_used);
+    int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size,
+                                              config->input_ping_dram,
+                                              &dram0_used, &dram1_used);
+    int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size,
+                                              config->input_pong_dram,
+                                              &dram0_used, &dram1_used);
+    int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size,
+                                               config->output_ping_dram,
+                                               &dram0_used, &dram1_used);
+    int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size,
+                                               config->output_pong_dram,
+                                               &dram0_used, &dram1_used);
+    int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size,
+                                            config->bias_dram,
+                                            &dram0_used, &dram1_used);
+
+    if (!p_input0 || !p_input1 || !p_coeff ||
+        !p_output0 || !p_output1 || !p_bias) {
+        return (-1);  // at least one DRAM buffer could not be allocated; no DMA issued yet
+    }
+
+    // ========================================================================
+    // SECTION 2: Initialize XAI Tile Descriptors
+    // ========================================================================
+    xai_tile3D tile_input;
+    xai_size3D frame_size_input;
+    xai_tile4D tile_coeff;
+    xai_array tile_bias;
+    xai_tile3D tile_output;
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+
+    /* Initialize DMA engines */
+    dma_3dm_init(1);
+    dma_2dm_init(0);
+
+    // Transfer constant data (no outscale)
+    dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size);
+    dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size);
+
+    // Initialize input buffer and load first tile
+    _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size);
+
+    // First DMA: load IN_ROWS_FIRSTDMA rows at data offset
+    dma_3dm(1,
+            (void*)src,
+            (void*)&p_input0[config->in_data_offset],
+            config->src_dim1_pitch,
+            config->in_dim1_pitch,
+            config->src_dim2_pitch,
+            config->in_dim2_pitch,
+            config->src_dim1_size,
+            config->in_rows_firstdma,
+            config->src_dim3_size);
+
+    // Wait for all initial DMA transfers to complete
+    idma_hw_wait_all(0);  // coeff + bias on ch0
+    idma_hw_wait_all(1);  // input on ch1
+
+    // ========================================================================
+    // Configure Input Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2(&tile_input, (config->in_rows_firstdma - config->in_dim2_edge1));
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2);
+
+    // Frame size for edge extension
+    frame_size_input.dim1Size = config->src_dim1_size;
+    frame_size_input.dim2Size = config->src_dim2_size;
+    frame_size_input.dim3Size = config->src_dim3_size;
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)
+    // ========================================================================
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size);
+
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles)
+    // ========================================================================
+    int last_tile = 1;
+
+    for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) {
+        int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1);
+        int current_n_size = (idx_n < config->n_tiles - 1) ?
+                             config->n_tile_size : config->n_tile_size_last;
+
+        // Update coefficient/bias for N-tile (no outscale)
+        XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n);
+        XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size);
+
+        XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size);
+
+        XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n);
+        XAI_TILE3D_SET_DIM3(&tile_output, current_n_size);
+
+        // Process vertical tiles
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1);
+
+            // Calculate actual rows for this height tile
+            int current_output_rows = (idx_h < config->height_tiles - 1) ?
+                                      config->output_rows :
+                                      (config->dst_dim2_size - (config->output_rows * idx_h));
+            int current_row_size = config->dst_dim1_size * current_output_rows;
+
+            // ================================================================
+            // Prefetch Next Input Tile (Ping-Pong Buffering)
+            // ================================================================
+            if (!last_h_tile) {
+                int temp_idx_h;
+                inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1);
+                _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size);
+                dma_3dm(1,  // NOTE(review): dest offset below is clamped to 1, not 0 - confirm with the generator
+                        (void*)&(src[max(((config->stride_y * config->output_rows * temp_idx_h - config->in_dim2_edge1) * config->src_dim1_size),0)]),
+                        (void*)&(p_input1[max((((-(config->output_rows)* config->in_dim1_pitch))*(temp_idx_h))+(config->in_data_offset),1)]),
+                        config->src_dim1_pitch,
+                        config->in_dim1_pitch,
+                        config->src_dim2_pitch,
+                        config->in_dim2_pitch,
+                        config->src_dim1_size,
+                        min(((config->stride_y * config->output_rows)*(temp_idx_h))+(config->input_rows-config->in_dim2_edge1),min((((-(config->stride_y * config->output_rows)))*(temp_idx_h))+(config->src_dim2_size + config->in_dim2_edge2),config->input_rows)),
+                        config->src_dim3_size);
+            }
+
+            // ================================================================
+            // Update Tile Descriptors for Current Iteration
+            // ================================================================
+            XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+            XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+            XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1);
+            XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows);
+
+            // ================================================================
+            // Perform Edge Extension and Convolution (non-VQ API)
+            // ================================================================
+            xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size_input);
+            XAI_ERR_TYPE status = xaiConvolved3D_S_3x3j2d1_S8S8IX_MOW_WHD(
+                                        &(tile_input),
+                                        &(tile_coeff),
+                                        &(tile_bias),
+                                        &(tile_output),
+                                        &(params));
+            if (status != XAI_ERR_OK) {
+                // Do not return with DMA in flight (ch1 prefetch, ch0 output write).
+                idma_hw_wait_all(0);
+                idma_hw_wait_all(1);
+                return status;
+            }
+            // ================================================================
+            // Prefetch next coefficient tile (if needed)
+            // ================================================================
+            if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) {
+                int temp_idx_n;
+                int temp_idx_h;
+                inc_iter_to_temp(&(temp_idx_n),idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1));
+                dma_1dm(0, (coeff_ptr + (config->coeff_buffer_size * temp_idx_n)), &(p_coeff[0]), (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last)));
+            }
+
+            // ================================================================
+            // Write Output Tile to System Memory
+            // ================================================================
+            dma_2dm(0,
+                    &(p_output1[0]),
+                    &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim2_pitch)*(idx_h))],
+                    config->out_dim2_pitch,
+                    config->dst_dim2_pitch,
+                    current_row_size,
+                    current_n_size);
+
+            swap_buffers(&(p_output0), &(p_output1));  // ping <-> pong for the next tile
+            swap_buffers(&(p_input0), &(p_input1));
+        }
+    }
+
+    // Wait for final output DMA to complete before returning
+    idma_hw_wait_all(0);
+
+    return XAI_ERR_OK;
+}
+
+// Non-VQ cache version - per-tensor output scaling.
+//
+// Runs the whole layer with one xaiConvolved3D() call, working on the caller's
+// tensors in place instead of on DMA-staged tiles:
+//   1. the scratch buffer from get_cache_padded_input() is filled with the
+//      input zero point and src is copied into its interior row by row;
+//   2. xaiExtendEdgesConst3D_I8() is applied to that padded tile;
+//   3. the kernel writes directly into dst, which is then written back from
+//      the data cache.
+//
+//   src       - input tensor (WHD), read with config->src_dim1/2_pitch
+//   dst       - output tensor (WHD), laid out with config->dst_dim1/2_pitch
+//   coeff_ptr - signed 8-bit coefficients (WHDN)
+//   bias_ptr  - config->dst_dim3_size 32-bit bias values
+//   config    - layer geometry, padding and quantization parameters
+//
+// Returns XAI_ERR_DATASIZE if the padded input does not fit in the scratch
+// buffer, otherwise the status of xaiConvolved3D().
+XAI_ERR_TYPE conv_exec_3x3j2d1_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // Get padded input buffer from allocator
+    // ========================================================================
+    int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2;
+    // Row pitch: padded width rounded up to a multiple of 2*XCHAL_IVPN_SIMD_WIDTH
+    // (the mask trick assumes that value is a power of two).
+    int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);
+    int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2;
+    int dim2_pitch = dim1_pitch * padded_dim2;
+    int input_buffer_size = dim2_pitch * config->src_dim3_size;
+
+    int8_t* padded_input = get_cache_padded_input();
+
+    // Bail out before touching the buffer if this layer needs more than it holds.
+    if (input_buffer_size > (int)get_cache_padded_input_size()) {
+        return XAI_ERR_DATASIZE;
+    }
+
+    // Start from an all-zero-point buffer so padding and pitch slack are defined.
+    memset(padded_input, config->input_zero_point, input_buffer_size);
+
+    // ========================================================================
+    // Setup padded input tile descriptor
+    // ========================================================================
+    // Offset of the first input element: skip the top padding rows, then the left padding.
+    int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1;
+
+    xai_tile3D tile_input;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch);
+    // DIM1/DIM2 are the unpadded sizes; the EDGE fields describe the padding around them.
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+
+    // Copy raw input to padded buffer
+    // Safe manual copy: avoids SIMD overread near source buffer boundary
+    for (int d = 0; d < config->src_dim3_size; d++) {
+        for (int h = 0; h < config->src_dim2_size; h++) {
+            // Only the src_dim1_size valid bytes of each row are copied; the rest keeps the fill value.
+            memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch],
+                   &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch],
+                   config->src_dim1_size);
+        }
+    }
+
+    // NOTE(review): the frame is sized from the output dims times the stride, not
+    // from src_dim1/2_size as in the DMA variant; confirm this is intentional.
+    xai_size3D frame_size;
+    frame_size.dim1Size = config->dst_dim1_size * config->stride_x;
+    frame_size.dim2Size = config->dst_dim2_size * config->stride_y;
+    frame_size.dim3Size = config->src_dim3_size;
+
+    xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size);
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor
+    // ========================================================================
+    xai_tile4D tile_coeff;
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    xai_array tile_bias;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    // The descriptor covers the full output tensor, so the kernel writes straight into dst.
+    xai_tile3D tile_output;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // Execute convolution (non-VQ API)
+    // ========================================================================
+    XAI_ERR_TYPE status = xaiConvolved3D(&tile_input, &tile_coeff, &tile_bias,
+                                          &tile_output, &params);
+
+    // Writeback output from cache to system memory for DMA coherency
+    xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size);
+
+    return status;
+}
\ No newline at end of file
diff --git a/backends/cadence/vision/operators/conv/conv_exec_7x7j2d1.c b/backends/cadence/vision/operators/conv/conv_exec_7x7j2d1.c
new file mode 100644
index 00000000000..36709da0e78
--- /dev/null
+++ b/backends/cadence/vision/operators/conv/conv_exec_7x7j2d1.c
@@ -0,0 +1,1088 @@
+#include "kernel_executors.h"
+#include "memory_manager.h"
+#include "dma.h"
+#include "utils.h"
+#include <xai_cnn_api.h>
+#include <string.h>
+#include <xtensa/hal.h>
+
+// conv 7x7j2d1 VQ executor with DMA (per-channel output scaling): runs the kernel over N-tiles x H-tiles from staged buffers; returns XAI_ERR_OK, the first non-OK kernel status, or -1 when a buffer allocation fails.
+XAI_ERR_TYPE conv_exec_7x7j2d1VQ(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    int8_t* outScale_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // SECTION 1: DRAM Buffer Allocation
+    // ========================================================================
+    int dram0_used = 0;
+    int dram1_used = 0;
+    // The second argument of each call is that buffer's config->*_dram field (presumably a bank selector - confirm against allocate_dram_buffer).
+    int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, 
+                                             config->input_ping_dram, 
+                                             &dram0_used, &dram1_used);
+    int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, 
+                                             config->input_pong_dram, 
+                                             &dram0_used, &dram1_used);
+    int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, 
+                                            config->coeff_dram, 
+                                            &dram0_used, &dram1_used);
+    int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, 
+                                              config->output_ping_dram, 
+                                              &dram0_used, &dram1_used);
+    int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, 
+                                              config->output_pong_dram, 
+                                              &dram0_used, &dram1_used);
+    int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, 
+                                           config->bias_dram, 
+                                           &dram0_used, &dram1_used);
+    int8_t* p_outscale = allocate_dram_buffer(config->outscale_buffer_size, 
+                                               config->outscale_dram, 
+                                               &dram0_used, &dram1_used);
+    // Bail out before any DMA is issued if any of the seven allocations failed.
+    if (!p_input0 || !p_input1 || !p_coeff || 
+        !p_output0 || !p_output1 || !p_bias || !p_outscale) {
+        return (-1);
+    }
+    
+    // ========================================================================
+    // SECTION 2: Initialize XAI Tile Descriptors
+    // ========================================================================
+    xai_tile3D tile_input;
+    xai_size3D frame_size_input;
+    xai_tile4D tile_coeff;
+    xai_array tile_bias;
+    xai_array tile_outscale;
+    xai_tile3D tile_output;
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    
+    /* Initialize DMA engines */
+    dma_3dm_init(1);
+    dma_2dm_init(0);
+
+    // Queue constant data on channel 0: coeff_buffer_size bytes of coefficients (N-tile 0), plus bias and output scales
+    dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size);
+    dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size);
+    dma_1dm(0, outScale_ptr, p_outscale, config->outscale_buffer_size);
+    
+    // Pre-fill the ping input buffer with the input zero point, then load the first tile
+    _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size);
+    
+    // First DMA: load IN_ROWS_FIRSTDMA rows at data offset
+    dma_3dm(1,
+            (void*)src,
+            (void*)&p_input0[config->in_data_offset],
+            config->src_dim1_pitch,
+            config->in_dim1_pitch,
+            config->src_dim2_pitch,
+            config->in_dim2_pitch,
+            config->src_dim1_size,
+            config->in_rows_firstdma,
+            config->src_dim3_size);
+    
+    // Wait for all initial DMA transfers to complete
+    idma_hw_wait_all(0);  // coeff + bias + outscale on ch0
+    idma_hw_wait_all(1);  // input on ch1
+    
+    // ========================================================================
+    // Configure Input Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2(&tile_input, (config->in_rows_firstdma - config->in_dim2_edge1));
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2);
+    
+    // Frame size for edge extension
+    frame_size_input.dim1Size = config->src_dim1_size;
+    frame_size_input.dim2Size = config->src_dim2_size;
+    frame_size_input.dim3Size = config->src_dim3_size;
+    
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)
+    // ========================================================================
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Output Scale Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, p_outscale);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->outscale_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_outscale, p_outscale);
+    XAI_ARRAY_SET_WIDTH(&tile_outscale, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1);
+    XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16);
+    XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Output Tile Descriptor (matches convIdma.c); its buffer/data pointers are set per iteration in the loop
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->out_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->out_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->out_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+    
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+    
+    // ========================================================================
+    // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles)
+    // ========================================================================
+    int last_tile = 1;
+    // last_tile is always 1 here, so last_n_tile below is simply (idx_n == n_tiles - 1).
+    for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) {
+        int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1);
+        int current_n_size = (idx_n < config->n_tiles - 1) ? 
+                             config->n_tile_size : config->n_tile_size_last;
+        
+        // Update coefficient/bias/outscale for N-tile
+        XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n);
+        XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size);
+        // p_bias/p_outscale are int8_t*, so the offsets below are in bytes: 4 per S32 bias entry, 2 per U16 scale entry.
+        XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size);
+        
+        XAI_ARRAY_SET_DATA_PTR(&tile_outscale, &p_outscale[config->n_tile_size * 2 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_outscale, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_outscale, current_n_size);
+        
+        XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n);
+        XAI_TILE3D_SET_DIM3(&tile_output, current_n_size);
+        
+        // Process vertical tiles
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1);
+            // True only on the final (N, H) iteration, which has no following tile to prefetch.
+            // ================================================================
+            // Prefetch Next Input Tile (Ping-Pong Buffering)
+            // ================================================================
+            if (!last_h_tile) {
+                int temp_idx_h;
+                inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1);
+                _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size);
+
+                // Transfer next input tile from system memory to DRAM
+                // Generalized formulas for variable output_rows
+                // Key insight: (EDGE+1) in original = stride_y * output_rows
+                int stride_output = config->stride_y * config->output_rows;
+                
+                // Calculate row count for this tile
+                int row_count;
+                if (temp_idx_h < (config->height_tiles - 1)) {
+                    // Non-last tiles: min of (progress + buffer_size - edge, remaining_from_end, buffer_size)
+                    int prog_rows = (stride_output * temp_idx_h) + (config->in_dim2_size - config->in_dim2_edge1);
+                    int rem_rows = (config->src_dim2_size + config->in_dim1_edge2) - (stride_output * temp_idx_h);
+                    row_count = min(prog_rows, min(rem_rows, config->in_dim2_size));
+                } else {
+                    // Last tile: transfer remaining rows from source (accounting for source starting offset)
+                    int src_start_row = max(stride_output * temp_idx_h - config->in_dim2_edge1, 0);
+                    row_count = config->src_dim2_size - src_start_row;
+                }
+                // NOTE(review): rem_rows uses in_dim1_edge2 in a dim2 row computation, and the src offset below scales rows by src_dim1_size rather than src_dim1_pitch - confirm both are intended.
+                dma_3dm(1,
+                    (uint64_t *)&(src[max(((stride_output * temp_idx_h - config->in_dim2_edge1) * config->src_dim1_size),0)]),
+                    (uint64_t *)&(p_input1[config->in_data_offset - min(stride_output * temp_idx_h * config->in_dim1_pitch, config->in_data_offset - config->in_dim2_edge1)]),
+                    config->src_dim1_pitch,
+                    config->in_dim1_pitch,
+                    config->src_dim2_pitch,
+                    config->in_dim2_pitch,
+                    config->src_dim1_size,
+                    row_count,
+                    config->src_dim3_size);
+            }
+            
+            // ================================================================
+            // Update Tile Descriptors for Current Iteration
+            // ================================================================
+            XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+            XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+            // Input vertical coordinate: stride * tile_size * tile_index (matches convIdma.c)
+            XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->out_dim2_size)*(idx_h));
+            XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1);
+            XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->out_dim2_size)*(idx_h));
+            // NOTE(review): the coordinates above use out_dim2_size while the prefetch/output DMA offsets use output_rows - presumably equal; confirm.
+            // ================================================================
+            // Perform Edge Extension and Convolution
+            // ================================================================
+            xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size_input);
+
+            XAI_ERR_TYPE status = xaiConvolvedVQ3D_S_7x7j2d1_S8S8IX_MOW_WHD(
+                                        &(tile_input),
+                                        &(tile_coeff),
+                                        &(tile_bias),
+                                        &(tile_outscale),
+                                        &(tile_output),
+                                        &(params));
+            // NOTE(review): the early return below does not wait on either DMA channel; confirm no transfer can still be in flight at this point.
+            if (status != XAI_ERR_OK) {
+                return status;
+            }
+
+            // ================================================================
+            // Prefetch next coefficient tile (if needed)
+            // ================================================================
+            if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) {
+                int temp_idx_n;
+                int temp_idx_h;
+                inc_iter_to_temp(&(temp_idx_n), idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1));
+                dma_1dm(0, (coeff_ptr + (config->coeff_buffer_size * temp_idx_n)), &(p_coeff[0]), (((temp_idx_n) < (config->n_tiles - 1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last)));
+            }
+
+            // ================================================================
+            // Write Output Tile to System Memory (matches convIdma.c formula)
+            // ================================================================
+            // On the last height tile only the remaining rows (dst_dim2_size - output_rows * idx_h)
+            // are written, to avoid spilling into the next channel's memory.
+            {
+                int current_output_rows = (idx_h < config->height_tiles - 1)
+                    ? config->output_rows
+                    : (config->dst_dim2_size - (config->output_rows * idx_h));
+                int output_row_bytes = config->out_dim1_pitch * current_output_rows;
+                dma_2dm(0,
+                        &(p_output1[0]),
+                        &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n)) + ((config->out_dim1_pitch * config->output_rows)*(idx_h))],
+                        config->out_dim2_pitch,
+                        config->dst_dim2_pitch,
+                        output_row_bytes,
+                        current_n_size);
+            }
+
+            // Swap ping-pong buffers for next iteration. NOTE(review): no idma_hw_wait_all call appears inside this loop; presumably the dma_* helpers synchronize - confirm.
+            swap_buffers(&(p_output0), &(p_output1));
+            swap_buffers(&(p_input0), &(p_input1));
+        }
+    }
+    
+    // Wait for final output DMA to complete before returning
+    idma_hw_wait_all(0);
+
+    return XAI_ERR_OK;
+}
+
+// conv 7x7j2d1 executor with DMA (per-tensor output scaling)
+XAI_ERR_TYPE conv_exec_7x7j2d1(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // SECTION 1: DRAM Buffer Allocation
+    // ========================================================================
+    int dram0_used = 0;
+    int dram1_used = 0;
+    
+    int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, 
+                                             config->input_ping_dram, 
+                                             &dram0_used, &dram1_used);
+    int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, 
+                                             config->input_pong_dram, 
+                                             &dram0_used, &dram1_used);
+    int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, 
+                                            config->coeff_dram, 
+                                            &dram0_used, &dram1_used);
+    int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, 
+                                              config->output_ping_dram, 
+                                              &dram0_used, &dram1_used);
+    int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, 
+                                              config->output_pong_dram, 
+                                              &dram0_used, &dram1_used);
+    int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, 
+                                           config->bias_dram, 
+                                           &dram0_used, &dram1_used);
+    
+    if (!p_input0 || !p_input1 || !p_coeff || 
+        !p_output0 || !p_output1 || !p_bias) {
+        return (-1);
+    }
+    
+    // ========================================================================
+    // SECTION 2: Initialize XAI Tile Descriptors
+    // ========================================================================
+    xai_tile3D tile_input;
+    xai_size3D frame_size_input;
+    xai_tile4D tile_coeff;
+    xai_array tile_bias;
+    xai_tile3D tile_output;
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    
+    /* Initialize DMA engines */
+    dma_3dm_init(1);
+    dma_2dm_init(0);
+
+    // Transfer constant data
+    dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size);
+    dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size);
+    
+    // Initialize input buffer and load first tile
+    _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size);
+    
+    // First DMA: load IN_ROWS_FIRSTDMA rows at data offset
+    dma_3dm(1,
+            (void*)src,
+            (void*)&p_input0[config->in_data_offset],
+            config->src_dim1_pitch,
+            config->in_dim1_pitch,
+            config->src_dim2_pitch,
+            config->in_dim2_pitch,
+            config->src_dim1_size,
+            config->in_rows_firstdma,
+            config->src_dim3_size);
+    
+    // Wait for all initial DMA transfers to complete
+    idma_hw_wait_all(0);  // coeff + bias on ch0
+    idma_hw_wait_all(1);  // input on ch1
+    
+    // ========================================================================
+    // Configure Input Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2(&tile_input, (config->in_rows_firstdma - config->in_dim2_edge1));
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2);
+    
+    // Frame size for edge extension
+    frame_size_input.dim1Size = config->src_dim1_size;
+    frame_size_input.dim2Size = config->src_dim2_size;
+    frame_size_input.dim3Size = config->src_dim3_size;
+    
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)
+    // ========================================================================
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size);
+    
+    // ========================================================================
+    // Configure Output Tile Descriptor (matches convIdma.c)
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->out_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->out_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->out_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+    
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+
+    //print config eg . config->accum_shift, config->dilation, config->flags, config->output_scale, config->output_shift, config->relu_max, config->relu_min, config->stride_x, config->stride_y
+    // ========================================================================
+    // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles)
+    // ========================================================================
+    int last_tile = 1;
+    
+    for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) {
+        int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1);
+        int current_n_size = (idx_n < config->n_tiles - 1) ? 
+                             config->n_tile_size : config->n_tile_size_last;
+        
+        // Update coefficient/bias for N-tile
+        XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n);
+        XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size);
+        
+        XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size);
+        
+        XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n);
+        XAI_TILE3D_SET_DIM3(&tile_output, current_n_size);
+        
+        // Process vertical tiles
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1);
+            
+            // ================================================================
+            // Prefetch Next Input Tile (Ping-Pong Buffering)
+            // ================================================================
+            if (!last_h_tile) {
+                int temp_idx_h;
+                inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1);
+                _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size);
+
+                // Transfer next input tile from system memory to DRAM
+                // Generalized formulas for variable output_rows
+                // Key insight: (EDGE+1) in original = stride_y * output_rows
+                int stride_output = config->stride_y * config->output_rows;
+                
+                // Calculate row count for this tile
+                int row_count;
+                if (temp_idx_h < (config->height_tiles - 1)) {
+                    // Non-last tiles: min of (progress + buffer_size - edge, remaining_from_end, buffer_size)
+                    int prog_rows = (stride_output * temp_idx_h) + (config->in_dim2_size - config->in_dim2_edge1);
+                    int rem_rows = (config->src_dim2_size + config->in_dim1_edge2) - (stride_output * temp_idx_h);
+                    row_count = min(prog_rows, min(rem_rows, config->in_dim2_size));
+                } else {
+                    // Last tile: transfer remaining rows from source (accounting for source starting offset)
+                    int src_start_row = max(stride_output * temp_idx_h - config->in_dim2_edge1, 0);
+                    row_count = config->src_dim2_size - src_start_row;
+                }
+                
+                dma_3dm(1,
+                    (uint64_t *)&(src[max(((stride_output * temp_idx_h - config->in_dim2_edge1) * config->src_dim1_size),0)]),
+                    (uint64_t *)&(p_input1[config->in_data_offset - min(stride_output * temp_idx_h * config->in_dim1_pitch, config->in_data_offset - config->in_dim2_edge1)]),
+                    config->src_dim1_pitch,
+                    config->in_dim1_pitch,
+                    config->src_dim2_pitch,
+                    config->in_dim2_pitch,
+                    config->src_dim1_size,
+                    row_count,
+                    config->src_dim3_size);
+            }
+            
+            // ================================================================
+            // Update Tile Descriptors for Current Iteration
+            // ================================================================
+            XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+            XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+            // Input vertical coordinate: stride * tile_size * tile_index (matches convIdma.c)
+            XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->out_dim2_size)*(idx_h));
+            XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1);
+            XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->out_dim2_size)*(idx_h));
+            
+            // ================================================================
+            // Perform Edge Extension and Convolution
+            // ================================================================
+            xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size_input);
+
+            XAI_ERR_TYPE status = xaiConvolved3D_S_7x7j2d1_S8S8IX_MOW_WHD(
+                                        &(tile_input),
+                                        &(tile_coeff),
+                                        &(tile_bias),
+                                        &(tile_output),
+                                        &(params));
+            
+            if (status != XAI_ERR_OK) {
+                return status;
+            }
+
+            // ================================================================
+            // Prefetch next coefficient tile (if needed)
+            // ================================================================
+            if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) {
+                int temp_idx_n;
+                int temp_idx_h;
+                inc_iter_to_temp(&(temp_idx_n), idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1));
+                dma_1dm(0, (coeff_ptr + (config->coeff_buffer_size * temp_idx_n)), &(p_coeff[0]), (((temp_idx_n) < (config->n_tiles - 1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last)));
+            }
+
+            // ================================================================
+            // Write Output Tile to System Memory (matches convIdma.c formula)
+            // ================================================================
+            // Fix: For the last height tile, only write the valid output rows
+            // to avoid spilling into the next channel's memory.
+            {
+                int current_output_rows = (idx_h < config->height_tiles - 1)
+                    ? config->output_rows
+                    : (config->dst_dim2_size - (config->output_rows * idx_h));
+                int output_row_bytes = config->out_dim1_pitch * current_output_rows;
+                dma_2dm(0,
+                        &(p_output1[0]),
+                        &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n)) + ((config->out_dim1_pitch * config->output_rows)*(idx_h))],
+                        config->out_dim2_pitch,
+                        config->dst_dim2_pitch,
+                        output_row_bytes,
+                        current_n_size);
+            }
+
+            // Swap ping-pong buffers for next iteration
+            swap_buffers(&(p_output0), &(p_output1));
+            swap_buffers(&(p_input0), &(p_input1));
+        }
+    }
+    
+    // Wait for final output DMA to complete before returning
+    idma_hw_wait_all(0);
+    return XAI_ERR_OK;
+}
+
+// conv 7x7j2d1 executor with caching (no DMA tiling): copies src into the shared padded buffer, then runs the
+// kernel once on whole-tensor descriptors. Returns XAI_ERR_DATASIZE if that buffer is too small, else the kernel status.
+XAI_ERR_TYPE conv_exec_7x7j2d1_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // Setup source raw tile descriptor (raw input, no padding); it is only voided below, never passed to a kernel
+    // ========================================================================
+    xai_tile3D src_raw;
+    XAI_TILE3D_SET_BUFF_PTR(&src_raw, src);
+    XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&src_raw, src);
+    XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0);
+
+    // ========================================================================
+    // Get padded input buffer from allocator (shared across cache kernels)
+    // ========================================================================
+    int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2;
+    int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);
+    int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2;
+    int dim2_pitch = dim1_pitch * padded_dim2;
+    int input_buffer_size = dim2_pitch * config->src_dim3_size;
+    // dim1_pitch rounds padded_dim1 up to a multiple of 2*XCHAL_IVPN_SIMD_WIDTH (the mask assumes a power of two).
+    // Get shared padded input buffer from allocator
+    int8_t* padded_input = get_cache_padded_input();
+    // Bail out before touching the buffer if this layer's padded input is larger than it.
+    if (input_buffer_size > (int)get_cache_padded_input_size()) {
+        return XAI_ERR_DATASIZE;
+    }
+    
+    // Pre-fill the whole padded buffer with the input zero point (not literal zero)
+    memset(padded_input, config->input_zero_point, input_buffer_size);
+
+    // ========================================================================
+    // Setup padded input tile descriptor (data pointer skips in_dim2_edge1 rows and in_dim1_edge1 columns)
+    // ========================================================================
+    int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1;
+    
+    xai_tile3D tile_input;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+
+    // ========================================================================
+    // Copy raw input to padded buffer and extend edges
+    // ========================================================================
+#ifdef USE_DMA_FOR_CACHE_COPY
+    // Use DMA 3D transfer to copy input data into padded buffer at data_offset
+    dma_3dm(0,
+            /* src */           src,
+            /* dst */           &padded_input[data_offset],
+            /* src_row_pitch */ config->src_dim1_pitch,
+            /* dst_row_pitch */ dim1_pitch,
+            /* src_tile_pitch */ config->src_dim2_pitch,
+            /* dst_tile_pitch */ dim2_pitch,
+            /* row_sz */        config->src_dim1_size,
+            /* nrows */         config->src_dim2_size,
+            /* ntiles */        config->src_dim3_size);
+#else
+    // Row-by-row memcpy of src_dim1_size bytes per row (no DMA, no library tile copy)
+    // Safe manual copy: avoids SIMD overread near source buffer boundary
+    for (int d = 0; d < config->src_dim3_size; d++) {
+        for (int h = 0; h < config->src_dim2_size; h++) {
+            memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch],
+                   &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch],
+                   config->src_dim1_size);
+        }
+    }
+    (void)src_raw;
+#endif
+    // frame_size is the output extent multiplied by the stride, not src_dim1_size/src_dim2_size.
+    xai_size3D frame_size;
+    frame_size.dim1Size = config->dst_dim1_size * config->stride_x;
+    frame_size.dim2Size = config->dst_dim2_size * config->stride_y;
+    frame_size.dim3Size = config->src_dim3_size;
+    // NOTE(review): in the DMA branch no completion wait is visible before this call - confirm dma_3dm blocks.
+    xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size);
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)  
+    // ========================================================================
+    xai_tile4D tile_coeff;
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr);    
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);   
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Bias Array (S32: dst_dim3_size entries of 4 bytes; bias_ptr is only typed as int8_t*)
+    // ========================================================================
+    xai_array tile_bias;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr);   
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size);
+
+
+    // ========================================================================
+    // Configure Output Tile Descriptor (points to system memory)
+    // ========================================================================
+    xai_tile3D tile_output;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // Execute convolution using specific optimized kernel directly
+    // (bypasses xaiConvolved3D dispatcher for deterministic variant selection)
+    // ========================================================================
+    XAI_ERR_TYPE status = xaiConvolved3D_S_7x7j2d1_S8S8IX_MOW_WHD(
+                                        &tile_input, &tile_coeff, &tile_bias,
+                                        &tile_output, &params);
+
+    // Writeback output from cache to system memory for DMA coherency (done even when status is not OK)
+    xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size);
+
+    return status;
+}
+
+// conv 7x7j2d1 cache executor, VQ variant: same padded-buffer flow, plus a U16 output-scale array with one entry per dst_dim3.
+XAI_ERR_TYPE conv_exec_7x7j2d1VQ_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    int8_t* outScale_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // Setup source raw tile descriptor (raw input, no padding); it is only voided below, never passed to a kernel
+    // ========================================================================
+    xai_tile3D src_raw;
+    XAI_TILE3D_SET_BUFF_PTR(&src_raw, src);
+    XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&src_raw, src);
+    XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0);
+
+    // ========================================================================
+    // Get padded input buffer from allocator (shared across cache kernels)
+    // ========================================================================
+    int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2;
+    int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);
+    int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2;
+    int dim2_pitch = dim1_pitch * padded_dim2;
+    int input_buffer_size = dim2_pitch * config->src_dim3_size;
+    // dim1_pitch rounds padded_dim1 up to a multiple of 2*XCHAL_IVPN_SIMD_WIDTH (the mask assumes a power of two).
+    // Get shared padded input buffer from allocator
+    int8_t* padded_input = get_cache_padded_input();
+    // Bail out before touching the buffer if this layer's padded input is larger than it.
+    if (input_buffer_size > (int)get_cache_padded_input_size()) {
+        return XAI_ERR_DATASIZE;
+    }
+    
+    // Pre-fill the whole padded buffer with the input zero point (not literal zero)
+    memset(padded_input, config->input_zero_point, input_buffer_size);
+
+    // ========================================================================
+    // Setup padded input tile descriptor (data pointer skips in_dim2_edge1 rows and in_dim1_edge1 columns)
+    // ========================================================================
+    int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1;
+    
+    xai_tile3D tile_input;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+
+    // ========================================================================
+    // Copy raw input to padded buffer and extend edges
+    // ========================================================================
+#ifdef USE_DMA_FOR_CACHE_COPY
+    // Use DMA 3D transfer to copy input data into padded buffer at data_offset
+    dma_3dm(0,
+            /* src */           src,
+            /* dst */           &padded_input[data_offset],
+            /* src_row_pitch */ config->src_dim1_pitch,
+            /* dst_row_pitch */ dim1_pitch,
+            /* src_tile_pitch */ config->src_dim2_pitch,
+            /* dst_tile_pitch */ dim2_pitch,
+            /* row_sz */        config->src_dim1_size,
+            /* nrows */         config->src_dim2_size,
+            /* ntiles */        config->src_dim3_size);
+#else
+    // Row-by-row memcpy of src_dim1_size bytes per row (no DMA, no library tile copy)
+    // Safe manual copy: avoids SIMD overread near source buffer boundary
+    for (int d = 0; d < config->src_dim3_size; d++) {
+        for (int h = 0; h < config->src_dim2_size; h++) {
+            memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch],
+                   &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch],
+                   config->src_dim1_size);
+        }
+    }
+    (void)src_raw;
+#endif
+    // frame_size is the output extent multiplied by the stride, not src_dim1_size/src_dim2_size.
+    xai_size3D frame_size;
+    frame_size.dim1Size = config->dst_dim1_size * config->stride_x;
+    frame_size.dim2Size = config->dst_dim2_size * config->stride_y;
+    frame_size.dim3Size = config->src_dim3_size;
+    // NOTE(review): in the DMA branch no completion wait is visible before this call - confirm dma_3dm blocks.
+    xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size);
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)  
+    // ========================================================================
+    xai_tile4D tile_coeff;
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr);    
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);   
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Bias Array (S32: dst_dim3_size entries of 4 bytes; bias_ptr is only typed as int8_t*)
+    // ========================================================================
+    xai_array tile_bias;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr);   
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Scale Array (U16: dst_dim3_size entries of 2 bytes; outScale_ptr is only typed as int8_t*)
+    // ========================================================================
+    xai_array tile_outscale;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, outScale_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->dst_dim3_size * 2);
+    XAI_ARRAY_SET_DATA_PTR(&tile_outscale, outScale_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_outscale, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1);
+    XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16);
+    XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor (points to system memory)
+    // ========================================================================
+    xai_tile3D tile_output;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // Execute convolution through xaiConvolvedVQ3D, passing tile_outscale alongside the bias array
+    // This version accesses data through the processor cache
+    // ========================================================================
+    XAI_ERR_TYPE status = xaiConvolvedVQ3D(&tile_input, &tile_coeff, &tile_bias, 
+                                            &tile_outscale, &tile_output, &params);
+
+    // Writeback output from cache to system memory for DMA coherency (done even when status is not OK)
+    xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size);
+
+    return status;
+}
diff --git a/backends/cadence/vision/operators/conv/conv_kernel_dispatcher.c b/backends/cadence/vision/operators/conv/conv_kernel_dispatcher.c
new file mode 100644
index 00000000000..b9c5c326326
--- /dev/null
+++ b/backends/cadence/vision/operators/conv/conv_kernel_dispatcher.c
@@ -0,0 +1,50 @@
+/*
+ * conv_kernel_dispatcher.c
+ *
+ *  Created on: Dec 8, 2025
+ *      Author: Suraj Raut
+ *
+ *  Description:
+ *      Dispatcher that routes convolution execution to kernel-specific executors.
+ *      Each kernel type has its own source file with exact DMA formulas from convIdma.c.
+ */
+
+#include "kernel_executors.h"
+#include <string.h>
+
+/**
+ * Dispatch to the executor registered for config->kernel_name.
+ * Returns XAI_ERR_BADARG for a NULL config, a NULL kernel_name, or an unknown name.
+ */
+XAI_ERR_TYPE conv_execute_kernel(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config)
+{
+    typedef XAI_ERR_TYPE (*conv_exec_fn)(int8_t*, int8_t*, int8_t*, int8_t*,
+                                         const conv_layer_config_t*);
+    static const struct { const char* name; conv_exec_fn run; } executors[] = {
+        { "7x7j2d1_dma",    conv_exec_7x7j2d1       },
+        { "7x7j2d1_no_dma", conv_exec_7x7j2d1_cache },
+        { "3x3j1d1_dma",    conv_exec_3x3j1d1       },
+        { "3x3j1d1_no_dma", conv_exec_3x3j1d1_cache },
+        { "3x3j2d1_dma",    conv_exec_3x3j2d1       },
+        { "3x3j2d1_no_dma", conv_exec_3x3j2d1_cache },
+        { "1x1j2d1_dma",    conv_exec_1x1j2d1       },
+        { "1x1j2d1_no_dma", conv_exec_1x1j2d1_cache },
+        { "1x1j1d1_dma",    conv_exec_1x1j1d1       },
+        { "1x1j1d1_no_dma", conv_exec_1x1j1d1_cache },
+    };
+    if (config == NULL || config->kernel_name == NULL) {
+        return XAI_ERR_BADARG;  /* strcmp on a NULL name would be undefined behaviour */
+    }
+    for (size_t i = 0; i < sizeof(executors) / sizeof(executors[0]); i++) {
+        if (strcmp(config->kernel_name, executors[i].name) == 0) {
+            return executors[i].run(src, dst, coeff_ptr, bias_ptr, config);
+        }
+    }
+    return XAI_ERR_BADARG;
+}
+
diff --git a/backends/cadence/vision/operators/conv/kernel_executors.h b/backends/cadence/vision/operators/conv/kernel_executors.h
new file mode 100644
index 00000000000..5cd9c27d818
--- /dev/null
+++ b/backends/cadence/vision/operators/conv/kernel_executors.h
@@ -0,0 +1,137 @@
+/*
+ * kernel_executors.h
+ *
+ *  Created on: Dec 8, 2025
+ *      Author: Suraj Raut
+ *
+ *  Description:
+ *      Header file declaring kernel-specific executor functions.
+ *      Each kernel (7x7j2d1, 3x3j1d1, 3x3j2d1, 1x1j2d1, 1x1j1d1) has its own
+ *      executor with exact DMA formulas matching convIdma.c reference.
+ *      
+ *      Non-VQ versions use per-tensor quantization (no outScale_ptr parameter).
+ */
+
+#ifndef KERNEL_EXECUTORS_H_
+#define KERNEL_EXECUTORS_H_
+
+#include "../layer_configs.h"
+
+/*
+ * Fallback XAI error type and codes for builds where XAI_ERR_TYPE is not already defined as a macro.
+ * NOTE(review): #ifndef cannot see a typedef, and XAI_ERR_DATASIZE (returned by the 7x7 cache
+ * executors) has no fallback here - confirm both against the real xai_cnn_api.h. */
+#ifndef XAI_ERR_TYPE
+typedef int XAI_ERR_TYPE;
+#define XAI_ERR_OK 0
+#define XAI_ERR_BADARG 4
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Execute 7x7 stride-2 convolution with DMA (per-tensor output scaling); dispatch key "7x7j2d1_dma".
+ */
+XAI_ERR_TYPE conv_exec_7x7j2d1(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config);
+
+/**
+ * Execute 3x3 stride-1 convolution (standard ResNet 3x3 layers); dispatch key "3x3j1d1_dma".
+ */
+XAI_ERR_TYPE conv_exec_3x3j1d1(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config);
+
+/**
+ * Execute 3x3 stride-2 convolution (downsampling layers); dispatch key "3x3j2d1_dma".
+ */
+XAI_ERR_TYPE conv_exec_3x3j2d1(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config);
+
+/**
+ * Execute 1x1 stride-2 convolution (projection layers for downsampling); dispatch key "1x1j2d1_dma".
+ */
+XAI_ERR_TYPE conv_exec_1x1j2d1(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config);
+
+/**
+ * Execute 1x1 stride-1 convolution (bottleneck layers); dispatch key "1x1j1d1_dma".
+ */
+XAI_ERR_TYPE conv_exec_1x1j1d1(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config);
+
+/*============================================================================
+ * Cache-based executors (no DMA, uses processor cache)
+ *============================================================================*/
+/* 7x7 stride-2; dispatch key "7x7j2d1_no_dma". Returns XAI_ERR_DATASIZE if the padded input exceeds the shared buffer. */
+XAI_ERR_TYPE conv_exec_7x7j2d1_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config);
+/* 3x3 stride-1; dispatch key "3x3j1d1_no_dma". */
+XAI_ERR_TYPE conv_exec_3x3j1d1_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config);
+/* 3x3 stride-2; dispatch key "3x3j2d1_no_dma". */
+XAI_ERR_TYPE conv_exec_3x3j2d1_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config);
+/* 1x1 stride-2; dispatch key "1x1j2d1_no_dma". */
+XAI_ERR_TYPE conv_exec_1x1j2d1_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config);
+/* 1x1 stride-1; dispatch key "1x1j1d1_no_dma". */
+XAI_ERR_TYPE conv_exec_1x1j1d1_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config);
+
+/**
+ * Dispatch to the executor matching config->kernel_name; returns XAI_ERR_BADARG if no name matches.
+ */
+XAI_ERR_TYPE conv_execute_kernel(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* KERNEL_EXECUTORS_H_ */
diff --git a/backends/cadence/vision/operators/layer_configs.h b/backends/cadence/vision/operators/layer_configs.h
new file mode 100644
index 00000000000..3f47d4533eb
--- /dev/null
+++ b/backends/cadence/vision/operators/layer_configs.h
@@ -0,0 +1,2403 @@
+/*
+ * layer_configs.h
+ *
+ * Auto-generated conv2d + maxpool layer configurations
+ * Generated from PTE extraction by generate_combined_configs.py
+ *
+ * DO NOT EDIT MANUALLY
+ */
+
+#ifndef LAYER_CONFIGS_H
+#define LAYER_CONFIGS_H
+
+#include <stdint.h>
+#include <stddef.h>  /* for NULL */
+
+#define IDMA_BUFFER_SIZE_DRAM0 (62976)  /* 61.5 KB (61.5 * 1024 bytes) */
+#define IDMA_BUFFER_SIZE_DRAM1 (62976)  /* 61.5 KB (61.5 * 1024 bytes) */
+
+/* ====================================================================== */
+/*  Conv2d configurations                                              */
+/* ====================================================================== */
+
+typedef struct {  /* One conv2d layer: identity strings, tensor geometry, tile buffers, kernel params. */
+    int layer_id;
+    const char* layer_name;
+    const char* kernel_name;
+    const char* config_key;
+    /* Whole source tensor. In the entries reviewed, dim2_pitch == dim1_pitch * dim2_size. */
+    int src_dim1_size;  int src_dim2_size;  int src_dim3_size;
+    int src_dim1_pitch; int src_dim2_pitch;
+    /* Whole destination tensor; pitches follow the same pattern as src in the entries reviewed. */
+    int dst_dim1_size;  int dst_dim2_size;  int dst_dim3_size;
+    int dst_dim1_pitch; int dst_dim2_pitch;
+    /* Input tile incl. edges: in entries reviewed, dim1_pitch == dim1_size + dim1_edge1 + dim1_edge2. */
+    int in_dim1_size;   int in_dim1_pitch;
+    int in_dim2_size;   int in_dim2_pitch;
+    int in_dim1_edge1;  int in_dim1_edge2;
+    int in_dim2_edge1;  int in_dim2_edge2;
+    int in_dim3_edge1;  int in_dim3_edge2;
+    int in_data_offset; int in_rows_firstdma;
+    /* Output tile: in entries reviewed, out_dim2_size == output_rows and out_dim3_size == n_tile_size. */
+    int out_dim1_size;  int out_dim1_pitch;
+    int out_dim2_size;  int out_dim2_pitch;
+    int out_dim3_size;
+    /* Coefficient tensor; in entries reviewed each pitch is the running product of the dim sizes. */
+    int coeff_dim1_size;  int coeff_dim2_size;
+    int coeff_dim3_size;  int coeff_dim4_size;
+    int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch;
+    /* Bias / output-scale extents; dim1 == dst_dim3_size in the entries reviewed. */
+    int bias_dim1_size;     int bias_dim2_size;
+    int outscale_dim1_size; int outscale_dim2_size;
+    /* Buffer sizes (presumably bytes - TODO confirm); reviewed entries: bias == 4 * dim1, outscale == 2 * dim1. */
+    int input_buffer_size;  int coeff_buffer_size;  int output_buffer_size;
+    int bias_buffer_size;   int outscale_buffer_size;
+    /* 0 or 1 per buffer; presumably selects the IDMA_BUFFER_SIZE_DRAM0 / _DRAM1 bank - TODO confirm. */
+    int input_ping_dram;  int input_pong_dram;  int coeff_dram;
+    int output_ping_dram; int output_pong_dram;
+    int bias_dram;        int outscale_dram;
+    /* Tiling: in the entries reviewed, n_tile_size * n_tiles == dst_dim3_size. */
+    int n_tile_size; int n_tiles; int n_tile_size_last;
+    int height_tiles; int output_rows; int input_rows;
+    /* Kernel shape/stride/padding/dilation; the rest look like quantization/ReLU settings - verify in executor. */
+    int kernel_w; int kernel_h;
+    int stride_x; int stride_y;
+    int padding;  int dilation;
+    int accum_shift; int relu_max; int relu_min;
+    int output_shift; int output_scale; int flags;
+    int input_zero_point;
+} conv_layer_config_t;
+
+#define NUM_CONV_LAYERS 29  /* NOTE(review): must equal the CONV_LAYER_CONFIGS entry count; not verified here */
+
+static const conv_layer_config_t CONV_LAYER_CONFIGS[] = {
+    {
+        .layer_id = 0,
+        .layer_name = "conv_7x7_s2_ic3_oc64",
+        .kernel_name = "7x7j2d1_dma",
+        .config_key = "3_64_64_64_7_7_32_32_2_2_3_1",
+        .src_dim1_size = 64,
+        .src_dim2_size = 64,
+        .src_dim3_size = 3,
+        .src_dim1_pitch = 64,
+        .src_dim2_pitch = 4096,
+        .dst_dim1_size = 32,
+        .dst_dim2_size = 32,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 32,
+        .dst_dim2_pitch = 1024,
+        .in_dim1_size = 64,
+        .in_dim1_pitch = 70,
+        .in_dim2_size = 35,
+        .in_dim2_pitch = 2450,
+        .in_dim1_edge1 = 3,
+        .in_dim1_edge2 = 3,
+        .in_dim2_edge1 = 3,
+        .in_dim2_edge2 = 3,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 213,
+        .in_rows_firstdma = 32,
+        .out_dim1_size = 32,
+        .out_dim1_pitch = 32,
+        .out_dim2_size = 15,
+        .out_dim2_pitch = 480,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 7,
+        .coeff_dim2_size = 7,
+        .coeff_dim3_size = 3,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 7,
+        .coeff_dim2_pitch = 49,
+        .coeff_dim3_pitch = 147,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7350,
+        .coeff_buffer_size = 9408,
+        .output_buffer_size = 30720,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 3,
+        .output_rows = 15,
+        .input_rows = 35,
+        .kernel_w = 7,
+        .kernel_h = 7,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 3,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 1,
+        .layer_name = "conv_3x3_s1_ic64_oc64",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "64_16_16_64_3_3_16_16_1_1_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 18,
+        .in_dim2_pitch = 324,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 17,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 16,
+        .out_dim2_pitch = 256,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 20736,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 16,
+        .input_rows = 18,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 2,
+        .layer_name = "conv_1x1_s2_ic64_oc128",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "64_16_16_128_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 15,
+        .in_dim2_pitch = 240,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 15,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 8,
+        .out_dim2_pitch = 64,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 15360,
+        .coeff_buffer_size = 8192,
+        .output_buffer_size = 8192,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 8,
+        .input_rows = 15,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 3,
+        .layer_name = "conv_3x3_s2_ic64_oc128",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "64_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 576,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5760,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 2,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 4,
+        .layer_name = "conv_3x3_s1_ic128_oc128",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "128_8_8_128_3_3_8_8_1_1_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 40,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 5120,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 4,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 5,
+        .layer_name = "conv_1x1_s2_ic128_oc256",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "128_8_8_256_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 7,
+        .in_dim2_pitch = 56,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 7,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 4,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7168,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 1,
+        .output_rows = 4,
+        .input_rows = 7,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 6,
+        .layer_name = "conv_3x3_s2_ic128_oc256",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "128_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6400,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 8,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 7,
+        .layer_name = "conv_3x3_s1_ic256_oc256",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "256_4_4_256_3_3_4_4_1_1_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 6144,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 16,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 8,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "256_4_4_512_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 12,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 3072,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 4,
+        .n_tile_size_last = 128,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 9,
+        .layer_name = "conv_3x3_s2_ic256_oc512",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "256_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 30,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7680,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 32,
+        .n_tile_size_last = 16,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 10,
+        .layer_name = "conv_3x3_s1_ic512_oc512",
+        .kernel_name = "3x3j1d1_dma",
+        .config_key = "512_2_2_512_3_3_2_2_1_1_1_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 4,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 5,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 64,
+        .n_tile_size_last = 8,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 4,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 11,
+        .layer_name = "conv_1x1_s1_ic64_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_256_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 7,
+        .in_dim2_pitch = 112,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 7,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 7,
+        .out_dim2_pitch = 112,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 7168,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 28672,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 256,
+        .n_tiles = 1,
+        .n_tile_size_last = 256,
+        .height_tiles = 3,
+        .output_rows = 7,
+        .input_rows = 7,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 12,
+        .layer_name = "conv_1x1_s1_ic64_oc64",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "64_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 64,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 16,
+        .in_dim2_pitch = 256,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 16,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 16,
+        .out_dim2_pitch = 256,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 64,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 64,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 16384,
+        .coeff_buffer_size = 4096,
+        .output_buffer_size = 16384,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 16,
+        .input_rows = 16,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 13,
+        .layer_name = "conv_1x1_s1_ic256_oc64",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_16_16_64_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 64,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 10,
+        .in_dim2_pitch = 160,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 10,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 10,
+        .out_dim2_pitch = 160,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 64,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 64,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 64,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 40960,
+        .coeff_buffer_size = 16384,
+        .output_buffer_size = 10240,
+        .bias_buffer_size = 256,
+        .outscale_buffer_size = 128,
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 1,
+        .n_tile_size_last = 64,
+        .height_tiles = 2,
+        .output_rows = 10,
+        .input_rows = 10,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 14,
+        .layer_name = "conv_1x1_s2_ic256_oc512",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "256_16_16_512_1_1_8_8_2_2_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 48,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 2048,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 4,
+        .n_tile_size_last = 128,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 15,
+        .layer_name = "conv_1x1_s1_ic256_oc128",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_16_16_128_1_1_16_16_1_1_0_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 16,
+        .dst_dim2_size = 16,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 16,
+        .dst_dim2_pitch = 256,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 16,
+        .in_dim2_size = 7,
+        .in_dim2_pitch = 112,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 7,
+        .out_dim1_size = 16,
+        .out_dim1_pitch = 16,
+        .out_dim2_size = 7,
+        .out_dim2_pitch = 112,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 28672,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 14336,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 1,
+        .n_tile_size_last = 128,
+        .height_tiles = 3,
+        .output_rows = 7,
+        .input_rows = 7,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 16,
+        .layer_name = "conv_3x3_s2_ic128_oc128",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "128_16_16_128_3_3_8_8_2_2_1_1",
+        .src_dim1_size = 16,
+        .src_dim2_size = 16,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 16,
+        .src_dim2_pitch = 256,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 16,
+        .in_dim1_pitch = 18,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 90,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 19,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 1152,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 11520,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 4,
+        .n_tile_size_last = 32,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 17,
+        .layer_name = "conv_1x1_s1_ic128_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "128_8_8_512_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 128,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 256,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 128,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 128,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 4096,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 256,
+        .n_tiles = 2,
+        .n_tile_size_last = 256,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 18,
+        .layer_name = "conv_1x1_s1_ic512_oc128",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_8_8_128_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 128,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 128,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 128,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 128,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 512,
+        .outscale_buffer_size = 256,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 2,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 19,
+        .layer_name = "conv_1x1_s2_ic512_oc1024",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 24,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 512,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 16,
+        .n_tile_size_last = 64,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 20,
+        .layer_name = "conv_1x1_s1_ic512_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_8_8_256_1_1_8_8_1_1_0_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 8,
+        .dst_dim2_size = 8,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 8,
+        .dst_dim2_pitch = 64,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 8,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 16,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 8,
+        .out_dim1_pitch = 8,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 16,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 4,
+        .n_tile_size_last = 64,
+        .height_tiles = 4,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 21,
+        .layer_name = "conv_3x3_s2_ic256_oc256",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "256_8_8_256_3_3_4_4_2_2_1_1",
+        .src_dim1_size = 8,
+        .src_dim2_size = 8,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 8,
+        .src_dim2_pitch = 64,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 8,
+        .in_dim1_pitch = 10,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 50,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 11,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 2304,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12800,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 16,
+        .n_tile_size_last = 16,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 22,
+        .layer_name = "conv_1x1_s1_ic256_oc1024",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 256,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 1024,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 128,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 256,
+        .coeff_dim4_size = 1024,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 256,
+        .bias_dim1_size = 1024,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 1024,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 1024,
+        .bias_buffer_size = 4096,
+        .outscale_buffer_size = 2048,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 128,
+        .n_tiles = 8,
+        .n_tile_size_last = 128,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 23,
+        .layer_name = "conv_1x1_s1_ic1024_oc256",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 256,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 256,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 256,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 256,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 1024,
+        .outscale_buffer_size = 512,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 8,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 24,
+        .layer_name = "conv_1x1_s2_ic1024_oc2048",
+        .kernel_name = "1x1j2d1_dma",
+        .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 3,
+        .in_dim2_pitch = 12,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 3,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 12288,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 128,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 64,
+        .n_tile_size_last = 32,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 3,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 25,
+        .layer_name = "conv_1x1_s1_ic1024_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 1024,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 4,
+        .dst_dim2_size = 4,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 4,
+        .dst_dim2_pitch = 16,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 4,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 8,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 4,
+        .out_dim1_pitch = 4,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 8,
+        .out_dim3_size = 32,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 1024,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 1024,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 32,
+        .n_tiles = 16,
+        .n_tile_size_last = 32,
+        .height_tiles = 2,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 26,
+        .layer_name = "conv_3x3_s2_ic512_oc512",
+        .kernel_name = "3x3j2d1_dma",
+        .config_key = "512_4_4_512_3_3_2_2_2_2_1_1",
+        .src_dim1_size = 4,
+        .src_dim2_size = 4,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 4,
+        .src_dim2_pitch = 16,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 4,
+        .in_dim1_pitch = 6,
+        .in_dim2_size = 5,
+        .in_dim2_pitch = 30,
+        .in_dim1_edge1 = 1,
+        .in_dim1_edge2 = 1,
+        .in_dim2_edge1 = 1,
+        .in_dim2_edge2 = 1,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 7,
+        .in_rows_firstdma = 4,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 8,
+        .coeff_dim1_size = 3,
+        .coeff_dim2_size = 3,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 3,
+        .coeff_dim2_pitch = 9,
+        .coeff_dim3_pitch = 4608,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 15360,
+        .coeff_buffer_size = 36864,
+        .output_buffer_size = 32,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 8,
+        .n_tiles = 64,
+        .n_tile_size_last = 8,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 5,
+        .kernel_w = 3,
+        .kernel_h = 3,
+        .stride_x = 2,
+        .stride_y = 2,
+        .padding = 1,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 27,
+        .layer_name = "conv_1x1_s1_ic512_oc2048",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 512,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 2048,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 64,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 512,
+        .coeff_dim4_size = 2048,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 512,
+        .bias_dim1_size = 2048,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 2048,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 2048,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 256,
+        .bias_buffer_size = 8192,
+        .outscale_buffer_size = 4096,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 64,
+        .n_tiles = 32,
+        .n_tile_size_last = 64,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+    {
+        .layer_id = 28,
+        .layer_name = "conv_1x1_s1_ic2048_oc512",
+        .kernel_name = "1x1j1d1_dma",
+        .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1",
+        .src_dim1_size = 2,
+        .src_dim2_size = 2,
+        .src_dim3_size = 2048,
+        .src_dim1_pitch = 2,
+        .src_dim2_pitch = 4,
+        .dst_dim1_size = 2,
+        .dst_dim2_size = 2,
+        .dst_dim3_size = 512,
+        .dst_dim1_pitch = 2,
+        .dst_dim2_pitch = 4,
+        .in_dim1_size = 2,
+        .in_dim1_pitch = 2,
+        .in_dim2_size = 2,
+        .in_dim2_pitch = 4,
+        .in_dim1_edge1 = 0,
+        .in_dim1_edge2 = 0,
+        .in_dim2_edge1 = 0,
+        .in_dim2_edge2 = 0,
+        .in_dim3_edge1 = 0,
+        .in_dim3_edge2 = 0,
+        .in_data_offset = 0,
+        .in_rows_firstdma = 2,
+        .out_dim1_size = 2,
+        .out_dim1_pitch = 2,
+        .out_dim2_size = 2,
+        .out_dim2_pitch = 4,
+        .out_dim3_size = 16,
+        .coeff_dim1_size = 1,
+        .coeff_dim2_size = 1,
+        .coeff_dim3_size = 2048,
+        .coeff_dim4_size = 512,
+        .coeff_dim1_pitch = 1,
+        .coeff_dim2_pitch = 1,
+        .coeff_dim3_pitch = 2048,
+        .bias_dim1_size = 512,
+        .bias_dim2_size = 1,
+        .outscale_dim1_size = 512,
+        .outscale_dim2_size = 1,
+        .input_buffer_size = 8192,
+        .coeff_buffer_size = 32768,
+        .output_buffer_size = 64,
+        .bias_buffer_size = 2048,
+        .outscale_buffer_size = 1024,
+        .input_ping_dram = 0,
+        .input_pong_dram = 0,
+        .coeff_dram = 0,
+        .output_ping_dram = 1,
+        .output_pong_dram = 1,
+        .bias_dram = 1,
+        .outscale_dram = 1,
+        .n_tile_size = 16,
+        .n_tiles = 32,
+        .n_tile_size_last = 16,
+        .height_tiles = 1,
+        .output_rows = 2,
+        .input_rows = 2,
+        .kernel_w = 1,
+        .kernel_h = 1,
+        .stride_x = 1,
+        .stride_y = 1,
+        .padding = 0,
+        .dilation = 1,
+        .accum_shift = 8,
+        .relu_max = 4000,
+        .relu_min = 0,
+        .output_shift = 11,
+        .output_scale = 0,
+        .flags = 0,
+        .input_zero_point = 0,
+    },
+};
+
+static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; }  /* same bound the CONV_LAYER_CONFIGS lookups below use */
+
+static inline const conv_layer_config_t* get_conv_config(int layer_id) {
+    const int in_range = (layer_id >= 0) && (layer_id < NUM_CONV_LAYERS);
+    return in_range ? &CONV_LAYER_CONFIGS[layer_id] : NULL;  /* NULL: bad id */
+}
+
+static inline const conv_layer_config_t* get_layer_config_by_params(
+    int ic, int ih, int iw,
+    int oc, int kh, int kw,
+    int oh, int ow,
+    int sy, int sx,
+    int pad, int dil)
+{
+    /* Scan CONV_LAYER_CONFIGS in order and return the first entry whose */
+    /* shape, kernel, stride, padding and dilation all equal the inputs. */
+    for (int idx = 0; idx < NUM_CONV_LAYERS; ++idx) {
+        const conv_layer_config_t* entry = CONV_LAYER_CONFIGS + idx;
+        const int same_input  = entry->src_dim3_size == ic &&
+                                entry->src_dim2_size == ih &&
+                                entry->src_dim1_size == iw;
+        const int same_output = entry->dst_dim3_size == oc &&
+                                entry->dst_dim2_size == oh &&
+                                entry->dst_dim1_size == ow;
+        const int same_kernel = entry->coeff_dim2_size == kh &&
+                                entry->coeff_dim1_size == kw;
+        const int same_conv   = entry->stride_y == sy && entry->stride_x == sx &&
+                                entry->padding == pad && entry->dilation == dil;
+        if (same_input && same_output && same_kernel && same_conv) return entry;
+    }
+    return NULL;  /* no table entry matches */
+}
+
+static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) {
+    /* Exact, case-sensitive match against each entry's config_key string. */
+    /* The comparison is hand-rolled, so no <string.h> call is made here.  */
+    if (!config_key) return NULL;
+    for (int idx = 0; idx < NUM_CONV_LAYERS; ++idx) {
+        const char* want = config_key;
+        const char* have = CONV_LAYER_CONFIGS[idx].config_key;
+        if (!have) continue;  /* entry carries no key */
+        for (; *want != '\0' && *want == *have; ++want, ++have) { }
+        if (*want == '\0' && *have == '\0') return &CONV_LAYER_CONFIGS[idx];
+    }
+    return NULL;
+}
+
+/* ====================================================================== */
+/*  MaxPool configurations                                             */
+/* ====================================================================== */
+
+typedef struct {
+    int layer_id;               /* lookup id; get_maxpool_config() uses it as the table index */
+    const char* layer_name;     /* human-readable label */
+    const char* config_key;     /* parameter string; appears to encode C_H_W_kh_kw_sh_sw_ph_pw */
+    /* Tensor geometry; the executors treat these sizes as float32 element counts */
+    int src_width;  int src_height;  int channels;
+    int dst_width;  int dst_height;
+    /* Tensor strides in elements (the DMA calls scale the plane pitches by sizeof(float)) */
+    int src_row_pitch;  int src_plane_pitch;
+    int dst_row_pitch;  int dst_plane_pitch;
+    /* Pooling window, stride and padding (padding is supplied as MIN_FLT32 by the executors) */
+    int kernel_h;  int kernel_w;
+    int stride_h;  int stride_w;
+    int pad_h;     int pad_w;
+    /* Local tile layout: in_tile_w is the row pitch, *_tile_plane the per-channel pitch */
+    int in_tile_w;      int in_tile_rows;   int in_tile_plane;
+    int in_data_offset;  /* pad_h*in_tile_w + pad_w: where tile 0 data starts */
+    int out_tile_w;     int out_tile_rows;  int out_tile_plane;
+    /* Tiling: channels per tile / tile count / size of last tile; rows per height tile */
+    int c_tile_size;  int c_tiles;  int c_tile_size_last;
+    int height_tiles; int output_rows; int input_rows;
+    /* Sizes requested from allocate_dram_buffer() for each ping/pong buffer */
+    int input_buffer_size;  int output_buffer_size;
+    /* Second argument to allocate_dram_buffer(); presumably the DRAM bank index */
+    int input_ping_dram;  int input_pong_dram;
+    int output_ping_dram; int output_pong_dram;
+} maxpool_layer_config_t;
+
+#define NUM_MAXPOOL_LAYERS 1
+
+static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = {  /* generated table; one entry */
+    {
+        .layer_id = 0,  /* equals the array index, which get_maxpool_config() relies on */
+        .layer_name = "maxpool_3x3s2_c64_32x32",
+        .config_key = "64_32_32_3_3_2_2_1_1",
+        .src_width = 32,
+        .src_height = 32,
+        .channels = 64,
+        .dst_width = 16,
+        .dst_height = 16,
+        .src_row_pitch = 32,
+        .src_plane_pitch = 1024,  /* src_width * src_height */
+        .dst_row_pitch = 16,
+        .dst_plane_pitch = 256,  /* dst_width * dst_height */
+        .kernel_h = 3,
+        .kernel_w = 3,
+        .stride_h = 2,
+        .stride_w = 2,
+        .pad_h = 1,
+        .pad_w = 1,
+        .in_tile_w = 34,  /* src_width + 2*pad_w */
+        .in_tile_rows = 5,  /* NOTE(review): larger than input_rows (3) - confirm intended */
+        .in_tile_plane = 170,  /* in_tile_w * in_tile_rows */
+        .in_data_offset = 35,  /* pad_h*in_tile_w + pad_w */
+        .out_tile_w = 16,
+        .out_tile_rows = 1,
+        .out_tile_plane = 16,  /* out_tile_w * out_tile_rows */
+        .c_tile_size = 64,
+        .c_tiles = 1,
+        .c_tile_size_last = 64,
+        .height_tiles = 16,  /* dst_height / output_rows */
+        .output_rows = 1,
+        .input_rows = 3,
+        .input_buffer_size = 43520,  /* c_tile_size * in_tile_plane * 4 */
+        .output_buffer_size = 4096,  /* c_tile_size * out_tile_plane * 4 */
+        .input_ping_dram = 0,
+        .input_pong_dram = 1,
+        .output_ping_dram = 1,
+        .output_pong_dram = 0,
+    },
+};
+
+static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; }  /* same bound the MAXPOOL_LAYER_CONFIGS lookups below use */
+
+static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) {
+    const int in_range = (layer_id >= 0) && (layer_id < NUM_MAXPOOL_LAYERS);
+    return in_range ? &MAXPOOL_LAYER_CONFIGS[layer_id] : NULL;  /* NULL: bad id */
+}
+
+static inline const maxpool_layer_config_t* get_maxpool_config_by_params(
+    int channels, int src_height, int src_width,
+    int kernel_h, int kernel_w,
+    int stride_h, int stride_w,
+    int pad_h, int pad_w)
+{
+    /* Linear search; every pooling parameter must match exactly. */
+    for (int idx = 0; idx < NUM_MAXPOOL_LAYERS; ++idx) {
+        const maxpool_layer_config_t* entry = MAXPOOL_LAYER_CONFIGS + idx;
+        const int same_shape  = entry->channels   == channels   &&
+                                entry->src_height == src_height &&
+                                entry->src_width  == src_width;
+        const int same_window = entry->kernel_h == kernel_h &&
+                                entry->kernel_w == kernel_w &&
+                                entry->stride_h == stride_h &&
+                                entry->stride_w == stride_w;
+        const int same_pad    = entry->pad_h == pad_h && entry->pad_w == pad_w;
+        if (same_shape && same_window && same_pad) return entry;
+    }
+    return NULL;
+}
+
+#endif /* LAYER_CONFIGS_H */
diff --git a/backends/cadence/vision/operators/maxpool/maxpool_exec_mxnj2.c b/backends/cadence/vision/operators/maxpool/maxpool_exec_mxnj2.c
new file mode 100644
index 00000000000..3d841517606
--- /dev/null
+++ b/backends/cadence/vision/operators/maxpool/maxpool_exec_mxnj2.c
@@ -0,0 +1,352 @@
+#include "maxpool_executors.h"
+#include "memory_manager.h"
+#include "dma.h"
+#include <string.h>
+#include <xtensa/hal.h>
+
+/* Minimal float definitions to avoid pulling in full math.h */
+#ifndef MIN_FLT32
+#define MIN_FLT32 (-3.402823466e+38F)
+#endif
+
+/* HW-optimised maxpool kernel (in library) */
+extern void maxpool2d_j2x2_f32(
+    float* restrict ptr_out,
+    const float* restrict ptr_inp,
+    int inp_height, int inp_width,
+    int out_height, int out_width,
+    int in_pitch_width, int in_pitch_height,
+    int out_pitch_width, int out_pitch_height,
+    unsigned char kernel_height,
+    unsigned char kernel_width);
+
+/* ---------------------------------------------------------------------- */
+/* Helper: fill a float buffer with a constant value (e.g. MIN_FLT32)     */
+/* ---------------------------------------------------------------------- */
+static void fill_buffer_f32(float* buf, float val, int count)
+{
+    /* Store val into buf[0..count-1]; count <= 0 writes nothing. */
+    float* cursor = buf;
+    for (int left = count; left > 0; --left) *cursor++ = val;
+}
+
+/* ---------------------------------------------------------------------- */
+/* Helper: swap two pointers                                               */
+/* ---------------------------------------------------------------------- */
+static inline void swap_f32_ptrs(float** a, float** b)
+{
+    /* Exchange the two float* values that a and b point at. */
+    float* const held = *b;
+    *b = *a; *a = held;
+}
+
+/* ====================================================================== */
+/* DMA-tiled executor                                                      */
+/* ====================================================================== */
+XAI_ERR_TYPE maxpool_exec_mxnj2(
+    float* src,
+    float* dst,
+    const maxpool_layer_config_t* config)
+{
+    /* Tiled float32 maxpool: ping-pong input/output tiles in local DRAM,
+     * DMA channel 1 prefetches input and channel 0 writes results to dst.
+     * Returns XAI_ERR_OK, or -1 on NULL arguments / failed allocation. */
+    /* ================================================================== */
+    /* SECTION 1: DRAM Buffer Allocation                                   */
+    /* ================================================================== */
+    if (!config || !src || !dst) return (-1);  /* reject NULL arguments */
+    int dram0_used = 0, dram1_used = 0;  /* passed by address to allocator */
+
+    int8_t* raw_in0 = allocate_dram_buffer(config->input_buffer_size,
+                                            config->input_ping_dram,
+                                            &dram0_used, &dram1_used);
+    int8_t* raw_in1 = allocate_dram_buffer(config->input_buffer_size,
+                                            config->input_pong_dram,
+                                            &dram0_used, &dram1_used);
+    int8_t* raw_out0 = allocate_dram_buffer(config->output_buffer_size,
+                                             config->output_ping_dram,
+                                             &dram0_used, &dram1_used);
+    int8_t* raw_out1 = allocate_dram_buffer(config->output_buffer_size,
+                                             config->output_pong_dram,
+                                             &dram0_used, &dram1_used);
+
+    if (!raw_in0 || !raw_in1 || !raw_out0 || !raw_out1) {
+        return (-1);
+    }
+
+    /* Cast to float pointers for kernel calls */
+    float* p_input0  = (float*)raw_in0;
+    float* p_input1  = (float*)raw_in1;
+    float* p_output0 = (float*)raw_out0;
+    float* p_output1 = (float*)raw_out1;
+
+    /* ================================================================== */
+    /* SECTION 2: Initialise DMA engines                                   */
+    /* ================================================================== */
+    dma_3dm_init(1);   /* ch1: 3D input prefetch  */
+    dma_2dm_init(0);   /* ch0: 2D output writeback */
+
+    /* ================================================================== */
+    /* SECTION 3: Load first input tile                                    */
+    /* ================================================================== */
+    /*
+     * The first tile starts at source row 0.  The buffer is pre-filled
+     * with MIN_FLT32 (identity for max) and data is placed at
+     * in_data_offset = pad_h*in_tile_w + pad_w, so the leading MIN_FLT32
+     * rows/columns act as top/left padding.  Later tiles recompute the
+     * DMA offset per tile.  C-tile 0 is the last (possibly short) tile
+     * when c_tiles == 1, so size it with the same rule as the loop below.
+     */
+    int first_c = (config->c_tiles > 1) ? config->c_tile_size
+                                        : config->c_tile_size_last;
+    fill_buffer_f32(p_input0, MIN_FLT32, first_c * config->in_tile_plane);
+
+    /* Source rows actually loaded for tile 0.  The conceptual first input
+     * row is -pad_h; those top_pad rows come from the MIN_FLT32 fill, so
+     * the load starts at source row 0 and is clamped to src_height. */
+    int first_in_end = (config->output_rows - 1) * config->stride_h
+                       - config->pad_h + config->kernel_h - 1;
+    int first_load_rows = (first_in_end >= config->src_height
+                           ? config->src_height - 1 : first_in_end)
+                          - 0 + 1;  /* src starts at row 0 */
+
+    /* First DMA: first_c planes, first_load_rows rows each */
+    dma_3dm(1,
+            /* src */            (void*)src,
+            /* dst */            (void*)&p_input0[config->in_data_offset],
+            /* src_row_pitch */  config->src_width  * (int)sizeof(float),
+            /* dst_row_pitch */  config->in_tile_w  * (int)sizeof(float),
+            /* src_tile_pitch */ config->src_plane_pitch * (int)sizeof(float),
+            /* dst_tile_pitch */ config->in_tile_plane   * (int)sizeof(float),
+            /* row_sz */         config->src_width * (int)sizeof(float),
+            /* nrows */          first_load_rows,
+            /* ntiles */         first_c);
+
+    idma_hw_wait_all(1);  /* input ready */
+
+    /* ================================================================== */
+    /* SECTION 4: Tiled Execution Loop  (C-tiles x H-tiles)                */
+    /* ================================================================== */
+    for (int idx_c = 0; idx_c < config->c_tiles; idx_c++) {
+        int last_c_tile = (idx_c == config->c_tiles - 1);
+        int current_c = (idx_c < config->c_tiles - 1)
+                         ? config->c_tile_size
+                         : config->c_tile_size_last;
+
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int last_h_tile = (last_c_tile) &&
+                              (idx_h == config->height_tiles - 1);
+
+            /* Output rows for this tile (last tile may be shorter) */
+            int cur_out_rows = (idx_h < config->height_tiles - 1)
+                               ? config->output_rows
+                               : (config->dst_height -
+                                  config->output_rows * idx_h);
+            int cur_in_rows  = cur_out_rows * config->stride_h;
+
+            /* ========================================================== */
+            /* Prefetch next input tile into pong buffer                    */
+            /* ========================================================== */
+            if (!last_h_tile) {
+                /* Determine next (c, h) indices */
+                int next_c = idx_c;
+                int next_h = idx_h + 1;
+                if (next_h >= config->height_tiles) {
+                    next_h = 0;
+                    next_c = idx_c + 1;
+                }
+                int next_c_start = config->c_tile_size * next_c;
+                int next_c_size  = (next_c < config->c_tiles - 1)
+                                    ? config->c_tile_size
+                                    : config->c_tile_size_last;
+
+                /*
+                 * Compute source-row start, load count, and DMA
+                 * destination offset for the next height tile.
+                 *
+                 * For kernel_h > stride_h (e.g. 3x3/s2) consecutive
+                 * tiles overlap in the source by (kernel_h - stride_h)
+                 * rows, so the stride between tiles in source space is
+                 * output_rows * stride_h, NOT input_rows.
+                 */
+                int next_out_start = config->output_rows * next_h;
+                int next_in_first  = next_out_start * config->stride_h
+                                     - config->pad_h;
+                int next_top_pad   = (next_in_first < 0)
+                                     ? -next_in_first : 0;
+                int next_src_row   = next_in_first + next_top_pad;
+
+                int next_actual_out =
+                    (next_h < config->height_tiles - 1)
+                    ? config->output_rows
+                    : (config->dst_height - next_out_start);
+                int next_in_last =
+                    (next_out_start + next_actual_out - 1)
+                    * config->stride_h
+                    - config->pad_h + config->kernel_h - 1;
+                int next_in_end_clamped =
+                    (next_in_last >= config->src_height)
+                    ? config->src_height - 1
+                    : next_in_last;
+                int next_load_rows = next_in_end_clamped
+                                     - next_src_row + 1;
+
+                /* DMA offset: top_pad rows of MIN_FLT32 + left pad */
+                int next_dma_offset = next_top_pad * config->in_tile_w
+                                      + config->pad_w;
+
+                fill_buffer_f32(p_input1, MIN_FLT32,
+                                next_c_size * config->in_tile_plane);
+
+                dma_3dm(1,
+                    /* src */
+                    (void*)&src[next_c_start * config->src_plane_pitch +
+                                next_src_row * config->src_width],
+                    /* dst */
+                    (void*)&p_input1[next_dma_offset],
+                    /* src_row_pitch */
+                    config->src_width * (int)sizeof(float),
+                    /* dst_row_pitch */
+                    config->in_tile_w * (int)sizeof(float),
+                    /* src_tile_pitch */
+                    config->src_plane_pitch * (int)sizeof(float),
+                    /* dst_tile_pitch */
+                    config->in_tile_plane * (int)sizeof(float),
+                    /* row_sz */
+                    config->src_width * (int)sizeof(float),
+                    /* nrows */
+                    next_load_rows,
+                    /* ntiles */
+                    next_c_size);
+            }
+
+            /* ========================================================== */
+            /* Execute maxpool on current input tile                        */
+            /* ========================================================== */
+            for (int c = 0; c < current_c; c++) {
+                /*
+                 * Pass the kernel a pointer to the START of the padded
+                 * tile (row 0, col 0 of the buffer); the MIN_FLT32 fill
+                 * supplies the padding the kernel reads through.
+                 * NOTE: do not add in_data_offset here - that skips the
+                 * padding and gives wrong results when pad_h/pad_w > 0.
+                 */
+                float* in_plane  = &p_input0[c * config->in_tile_plane];
+                float* out_plane = &p_output1[c * config->out_tile_plane];
+
+                maxpool2d_j2x2_f32(
+                    out_plane,
+                    in_plane,
+                    cur_in_rows,            /* inp_height    */
+                    config->src_width,      /* inp_width     */
+                    cur_out_rows,           /* out_height    */
+                    config->dst_width,      /* out_width     */
+                    config->in_tile_w,      /* in_pitch_width  */
+                    config->in_tile_plane,  /* in_pitch_height */
+                    config->dst_width,      /* out_pitch_width */
+                    config->out_tile_plane, /* out_pitch_height*/
+                    (unsigned char)config->kernel_h,
+                    (unsigned char)config->kernel_w);
+            }
+
+            /* ========================================================== */
+            /* Write output tile back to system memory via 2D DMA          */
+            /* ========================================================== */
+            {
+                int c_start  = config->c_tile_size * idx_c;
+                int h_out_start = config->output_rows * idx_h;
+                int row_bytes = config->dst_width * cur_out_rows
+                                * (int)sizeof(float);
+
+                /* Drain the previous writeback first: its source buffer is
+                 * the one the next tile's kernel output will overwrite. */
+                idma_hw_wait_all(0);
+                dma_2dm(0,
+                    /* src */        (void*)p_output1,
+                    /* dst */        (void*)&dst[c_start *
+                                        config->dst_plane_pitch +
+                                        h_out_start * config->dst_width],
+                    /* src_stride */ config->out_tile_plane *
+                                        (int)sizeof(float),
+                    /* dst_stride */ config->dst_plane_pitch *
+                                        (int)sizeof(float),
+                    /* row_size */   row_bytes,
+                    /* num_lines */  (short)current_c);
+            }
+
+            /* The next iteration computes on the prefetched tile: wait for it */
+            if (!last_h_tile) idma_hw_wait_all(1);
+            /* Swap ping-pong buffers */
+            swap_f32_ptrs(&p_output0, &p_output1);
+            swap_f32_ptrs(&p_input0,  &p_input1);
+        }
+    }
+
+    /* Wait for last output DMA before returning */
+    idma_hw_wait_all(0);
+
+    return XAI_ERR_OK;
+}
+
+/* ====================================================================== */
+/* Cache-mode fallback (no DMA, data accessed via processor cache)         */
+/* ====================================================================== */
+XAI_ERR_TYPE maxpool_exec_mxnj2_no_dma(
+    float* src,
+    float* dst,
+    const maxpool_layer_config_t* config)
+{
+    /* Cache-mode maxpool: pad into shared scratch, run the kernel per channel. */
+    if (!config || !src || !dst) return (-1);  /* reject NULL arguments */
+    int padded_w = config->src_width  + 2 * config->pad_w;
+    int padded_h = config->src_height + 2 * config->pad_h;
+    int plane_size = padded_w * padded_h;
+    int total_size = plane_size * config->channels;
+
+    /* Use shared padded-input scratch buffer from memory manager */
+    int8_t* raw_buf = get_cache_padded_input();
+    if (!raw_buf || total_size * (int)sizeof(float) >
+                    (int)get_cache_padded_input_size()) {
+        return (-1);   /* scratch buffer missing or too small */
+    }
+    float* padded = (float*)raw_buf;
+
+    /* Fill with MIN_FLT32 (identity for max) */
+    fill_buffer_f32(padded, MIN_FLT32, total_size);
+    /* Copy source data into padded buffer at correct offset */
+    int data_off = config->pad_h * padded_w + config->pad_w;
+    for (int c = 0; c < config->channels; c++) {
+        for (int h = 0; h < config->src_height; h++) {
+            memcpy(&padded[c * plane_size + data_off + h * padded_w],
+                   &src[c * config->src_plane_pitch + h * config->src_width],
+                   config->src_width * sizeof(float));
+        }
+    }
+
+    /* Run maxpool per channel plane, passing the START of the padded plane
+     * so the kernel's ky/kx loops read through the MIN_FLT32 padding. */
+    for (int c = 0; c < config->channels; c++) {
+        float* in_plane  = &padded[c * plane_size];
+        float* out_plane = &dst[c * config->dst_plane_pitch];
+
+        maxpool2d_j2x2_f32(
+            out_plane,
+            in_plane,
+            config->src_height,
+            config->src_width,
+            config->dst_height,
+            config->dst_width,
+            padded_w,
+            plane_size,
+            config->dst_width,
+            config->dst_plane_pitch,
+            (unsigned char)config->kernel_h,
+            (unsigned char)config->kernel_w);
+    }
+
+    /* Writeback output from cache */
+    xthal_dcache_region_writeback(dst,
+        config->dst_plane_pitch * config->channels * (int)sizeof(float));
+
+    return XAI_ERR_OK;
+}
diff --git a/backends/cadence/vision/operators/maxpool/maxpool_executors.h b/backends/cadence/vision/operators/maxpool/maxpool_executors.h
new file mode 100644
index 00000000000..90c44258bbb
--- /dev/null
+++ b/backends/cadence/vision/operators/maxpool/maxpool_executors.h
@@ -0,0 +1,61 @@
+/*
+ * maxpool_executors.h
+ *
+ *  Created on: Apr 21, 2026
+ *      Author: Suraj Raut
+ *
+ *  Description:
+ *      Function declarations for DMA-tiled maxpool executors.
+ *      Parallels conv/kernel_executors.h for the maxpool operator.
+ */
+
+#ifndef MAXPOOL_EXECUTORS_H_
+#define MAXPOOL_EXECUTORS_H_
+
+#include "../layer_configs.h"
+
+#ifndef XAI_ERR_TYPE
+typedef int XAI_ERR_TYPE;
+#define XAI_ERR_OK 0
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Execute MxN stride-2 maxpool with DMA tiling.
+ *
+ * Operates on float32 data in NCHW layout (one batch at a time).
+ * Uses ping-pong DMA transfers on local DRAM for overlap of
+ * DMA and computation.
+ *
+ * @param src   System-memory pointer to input  [C x H x W] float32
+ * @param dst   System-memory pointer to output [C x OH x OW] float32
+ * @param config  Pre-computed layer configuration (buffer sizes, tiling, etc.)
+ * @return XAI_ERR_OK on success, -1 if a DRAM tile buffer cannot be allocated
+ */
+XAI_ERR_TYPE maxpool_exec_mxnj2(
+    float* src,
+    float* dst,
+    const maxpool_layer_config_t* config);
+
+/**
+ * Execute MxN stride-2 maxpool without DMA (data accessed via processor cache).
+ * Fallback path when DRAM buffers are not available.
+ *
+ * @param src   System-memory pointer to input  [C x H x W] float32
+ * @param dst   System-memory pointer to output [C x OH x OW] float32
+ * @param config  Layer configuration (only dimension fields used)
+ * @return XAI_ERR_OK on success, -1 if the shared padded scratch buffer is too small
+ */
+XAI_ERR_TYPE maxpool_exec_mxnj2_no_dma(
+    float* src,
+    float* dst,
+    const maxpool_layer_config_t* config);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MAXPOOL_EXECUTORS_H_ */
diff --git a/backends/cadence/vision/operators/mean/mean_exec_dma.c b/backends/cadence/vision/operators/mean/mean_exec_dma.c
new file mode 100644
index 00000000000..e732fede559
--- /dev/null
+++ b/backends/cadence/vision/operators/mean/mean_exec_dma.c
@@ -0,0 +1,149 @@
+#include "mean_executors.h"
+#include "memory_manager.h"
+#include "dma.h"
+#include <xtensa/hal.h>
+
+/* SIMD mean kernel (in library) */
+extern void simd_mean_pool_2x2_to_1x1_float32(
+    float* restrict output,
+    const float* restrict input,
+    int N);
+
+/* ---------------------------------------------------------------------- */
+/* Helper: swap two float pointers                                         */
+/* ---------------------------------------------------------------------- */
+static inline void swap_ptrs(float** a, float** b)
+{
+    float* const held = *b; *b = *a; *a = held;  /* exchange the two pointers */
+}
+
+/* ====================================================================== */
+/* DMA-tiled mean executor with ping-pong                                  */
+/* ====================================================================== */
+XAI_ERR_TYPE mean_exec_dma(
+    const float* src,
+    float* dst,
+    int channels,
+    int spatial_h,
+    int spatial_w)
+{
+    int spatial = spatial_h * spatial_w;  /* e.g. 4 for 2x2 */
+    /* Kernel is 2x2-only (it is passed no spatial size): require H*W == 4 */
+    if (!src || !dst || channels < 0 || spatial != 4) return (-1);
+    if (channels == 0) return XAI_ERR_OK;  /* nothing to reduce */
+
+    /* ================================================================== */
+    /* Compute tiling: how many channels per chunk?                         */
+    /* Each DRAM bank holds one ping or pong set:                           */
+    /*   input_chunk  = chunk_ch * spatial * sizeof(float)                  */
+    /*   output_chunk = chunk_ch * sizeof(float)                            */
+    /*   total = chunk_ch * (spatial + 1) * 4                               */
+    /* chunk_ch must be a multiple of 16 (SIMD processes 16 ch/iteration).  */
+    /* ================================================================== */
+    int bytes_per_ch = (spatial + 1) * (int)sizeof(float);
+    int chunk_ch = IDMA_BUFFER_SIZE_DRAM0 / bytes_per_ch;
+    chunk_ch = (chunk_ch / 16) * 16;  /* round down to SIMD multiple */
+
+    if (chunk_ch < 16) {
+        return (-1);  /* DRAM too small */
+    }
+
+    /* Cap to actual channel count (round up to multiple of 16 for last tile) */
+    if (chunk_ch > channels) {
+        chunk_ch = ((channels + 15) / 16) * 16;
+    }
+
+    int inp_chunk_bytes = chunk_ch * spatial * (int)sizeof(float);
+
+    /* ================================================================== */
+    /* Buffer allocation: ping in DRAM0, pong in DRAM1                      */
+    /* Each bank: [ input_chunk | output_chunk ]                            */
+    /* ================================================================== */
+    float* inp_ping = (float*)dram0_pool;
+    float* out_ping = (float*)(dram0_pool + inp_chunk_bytes);
+    float* inp_pong = (float*)dram1_pool;  /* NOTE(review): sized from DRAM0 only; confirm DRAM1 is as large */
+    float* out_pong = (float*)(dram1_pool + inp_chunk_bytes);
+
+    /* ================================================================== */
+    /* Initialise DMA engines                                               */
+    /* ================================================================== */
+    dma_2dm_init(0);   /* ch0: output writeback */
+    dma_2dm_init(1);   /* ch1: input prefetch   */
+
+    /* ================================================================== */
+    /* Load first input chunk (serial — no overlap possible)                */
+    /* ================================================================== */
+    int ch_done = 0;
+    int cur_ch = (channels - ch_done > chunk_ch)
+                 ? chunk_ch
+                 : channels - ch_done;
+    int cur_inp_bytes = cur_ch * spatial * (int)sizeof(float);
+
+    dma_1dm(1, (void*)&src[ch_done * spatial], (void*)inp_ping, cur_inp_bytes);
+    idma_hw_wait_all(1);
+
+    /* ================================================================== */
+    /* Tiled execution loop with ping-pong                                  */
+    /* ================================================================== */
+    float* p_inp_cur  = inp_ping;
+    float* p_out_cur  = out_ping;
+    float* p_inp_next = inp_pong;
+    float* p_out_next = out_pong;
+
+    while (ch_done < channels) {
+        int this_ch = cur_ch;
+        int next_ch_start = ch_done + this_ch;
+        int have_next = (next_ch_start < channels);
+
+        /* ============================================================== */
+        /* Prefetch next input chunk into pong buffer (async)               */
+        /* ============================================================== */
+        int next_ch = 0;
+        if (have_next) {
+            next_ch = (channels - next_ch_start > chunk_ch)
+                      ? chunk_ch
+                      : channels - next_ch_start;
+            int next_inp_bytes = next_ch * spatial * (int)sizeof(float);
+
+            dma_1dm(1, (void*)&src[next_ch_start * spatial],
+                    (void*)p_inp_next, next_inp_bytes);
+            /* DMA runs in background while we compute below */
+        }
+
+        /* ============================================================== */
+        /* Execute SIMD mean on current chunk                               */
+        /* ============================================================== */
+        simd_mean_pool_2x2_to_1x1_float32(
+            p_out_cur,
+            p_inp_cur,
+            this_ch * spatial);
+
+        /* ============================================================== */
+        /* Write output chunk to system memory (async)                      */
+        /* ============================================================== */
+        int cur_out_bytes = this_ch * (int)sizeof(float);
+        dma_1dm(0, (void*)p_out_cur,
+                (void*)&dst[ch_done], cur_out_bytes);
+
+        /* ============================================================== */
+        /* Wait for input DMA of next tile to finish                        */
+        /* (In a well-tuned pipeline, DMA finishes during compute above)    */
+        /* ============================================================== */
+        if (have_next) {
+            idma_hw_wait_all(1);
+        }
+
+        /* Wait for output DMA before reusing this buffer as next pong */
+        idma_hw_wait_all(0);
+
+        /* Advance */
+        ch_done = next_ch_start;
+        cur_ch  = next_ch;
+
+        /* Swap ping-pong: current pong becomes next ping */
+        swap_ptrs(&p_inp_cur,  &p_inp_next);
+        swap_ptrs(&p_out_cur,  &p_out_next);
+    }
+
+    return XAI_ERR_OK;
+}
diff --git a/backends/cadence/vision/operators/mean/mean_executors.h b/backends/cadence/vision/operators/mean/mean_executors.h
new file mode 100644
index 00000000000..d56b45b4dc7
--- /dev/null
+++ b/backends/cadence/vision/operators/mean/mean_executors.h
@@ -0,0 +1,51 @@
+/*
+ * mean_executors.h
+ *
+ *  Created on: Apr 22, 2026
+ *      Author: Suraj Raut
+ *
+ *  Description:
+ *      Function declarations for DMA-tiled mean (adaptive_avg_pool2d) executors.
+ *      Parallels maxpool/maxpool_executors.h.
+ */
+
+#ifndef MEAN_EXECUTORS_H_
+#define MEAN_EXECUTORS_H_
+
+#ifndef XAI_ERR_TYPE /* NOTE(review): #ifndef only sees macros, not typedefs - confirm how the XAI headers declare this name */
+typedef int XAI_ERR_TYPE; /* local fallback for the executor status type */
+#define XAI_ERR_OK 0 /* success code returned by mean_exec_dma */
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Execute mean pooling (adaptive_avg_pool2d) with DMA ping-pong tiling.
+ *
+ * Reduces [C x H x W] float32 to [C] by averaging all spatial elements.
+ * Currently optimized for H=2, W=2 (calls simd_mean_pool_2x2_to_1x1_float32).
+ *
+ * Uses ping-pong DMA: prefetches next input chunk while computing on current.
+ * Channel tiles are rounded to 16 for SIMD alignment.
+ * Each tile's output DMA is awaited before the loop moves to the next tile.
+ * @param src        System-memory pointer to input  [C x H x W] float32
+ * @param dst        System-memory pointer to output [C] float32
+ * @param channels   Number of channels
+ * @param spatial_h  Spatial height (must be 2 for optimized path)
+ * @param spatial_w  Spatial width  (must be 2 for optimized path)
+ * @return XAI_ERR_OK on success, -1 if buffers unavailable
+ */
+XAI_ERR_TYPE mean_exec_dma(
+    const float* src,
+    float* dst,
+    int channels,
+    int spatial_h,
+    int spatial_w);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MEAN_EXECUTORS_H_ */
diff --git a/backends/cadence/vision/operators/op_add.cpp b/backends/cadence/vision/operators/op_add.cpp
index 81014143275..8c76378618c 100644
--- a/backends/cadence/vision/operators/op_add.cpp
+++ b/backends/cadence/vision/operators/op_add.cpp
@@ -6,67 +6,327 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <lib.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
+#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
 
 using executorch::aten::Scalar;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
-using executorch::runtime::canCast;
+using executorch::runtime::can_cast;
 using executorch::runtime::KernelRuntimeContext;
-using executorch::runtime::promoteTypes;
-using torch::executor::apply_binary_elementwise_fn;
 using torch::executor::Error;
-using torch::executor::native::utils::extract_scalar;
 
 namespace impl {
 namespace vision {
 namespace native {
 
+// Forward declaration of hardware-optimized vector addition function
+extern "C" void rvaddf(
+    float32_t* restrict z,
+    const float32_t* restrict x,
+    const float32_t* restrict y,
+    int N);
+
 Tensor& add_out(
     KernelRuntimeContext& ctx,
     const Tensor& a,
     const Tensor& b,
     const Scalar& alpha,
     Tensor& out) {
-  (void)ctx;
-
-  using namespace torch::executor::native::utils;
-
-  ScalarType a_type = a.scalar_type();
-  ScalarType b_type = b.scalar_type();
-  ScalarType common_type = promoteTypes(a_type, b_type);
-  ScalarType out_type = out.scalar_type();
-
-  ET_CHECK_MSG(a_type == ScalarType::Float, "Input tensor not a float.\n");
-  ET_CHECK_MSG(b_type == ScalarType::Float, "Input tensor not a float.\n");
-  ET_CHECK_MSG(out_type == ScalarType::Float, "Output tensor not a float.\n");
-
-  ET_CHECK(canCast(common_type, out_type));
-
-  using CTYPE_A = float;
-  using CTYPE_B = float;
-  using CTYPE_IN = float;
-  using CTYPE_OUT = float;
-  CTYPE_IN alpha_val;
-  ET_CHECK_MSG(
-      extract_scalar(alpha, &alpha_val),
-      "Could not be extracted: wrong type or out of range");
-
-  apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-      [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
-        CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-        CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-        CTYPE_IN value = a_casted + alpha_val * b_casted;
-
-        return static_cast<CTYPE_OUT>(value);
-      },
-      a,
-      b,
-      out);
+  
+  // Check if we can use optimized path: same shape, float32, alpha=1.0
+  bool same_shape = executorch::runtime::tensors_have_same_shape(a, b) &&
+                    executorch::runtime::tensors_have_same_shape(a, out);
+  bool is_float = (a.scalar_type() == ScalarType::Float) &&
+                  (b.scalar_type() == ScalarType::Float) &&
+                  (out.scalar_type() == ScalarType::Float);
+  
+  // Extract alpha value to check if it's 1.0
+  float alpha_val = 1.0f;
+  bool alpha_is_one = false;
+  if (is_float && torch::executor::native::utils::extract_scalar(alpha, &alpha_val)) {
+    alpha_is_one = (alpha_val == 1.0f);
+  }
+
+  size_t numel = out.numel();
+  
+  // Use optimized path if: float32, same shape, alpha=1.0, sufficient size, aligned
+  // Require numel to be even (2 floats = 8 bytes) for 8-byte aligned DMA
+  bool use_optimized = same_shape && is_float && alpha_is_one &&
+                       (numel >= 8) && ((numel % 2) == 0);
+
+  if (use_optimized) {
+
+    const float* a_data = a.const_data_ptr<float>();
+    const float* b_data = b.const_data_ptr<float>();
+    float* out_data = out.mutable_data_ptr<float>();
+
+    // Check if source data is 8-byte aligned (required for DMA)
+    bool src_aligned = (((uintptr_t)a_data & 0x7) == 0) &&
+                       (((uintptr_t)b_data & 0x7) == 0) &&
+                       (((uintptr_t)out_data & 0x7) == 0);
+
+    // DMA setup for two inputs + one output
+    bool ping_pong_process = false;
+    bool ping_process_pong = false;
+    size_t chunk_size = 0;
+
+    float32_t* inp_a_buff[2];
+    float32_t* inp_b_buff[2];
+    float32_t* out_buff[2];
+
+    // Check if DRAM buffers are available
+    bool dram0_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0);
+    bool dram1_available = (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0);
+    
+    // DMA threshold - beneficial for larger tensors
+    const size_t DMA_THRESHOLD = 1024;
+    bool use_dma = (numel >= DMA_THRESHOLD) && src_aligned;
+    
+    // Strategy 1: Ping-pong processing (2 sets of buffers)
+    // Need to fit: 2 inputs + 1 output per buffer (3 float32 arrays total)
+    // Split: 33% input_a, 33% input_b, 33% output per DRAM
+    if (use_dma && dram0_available && dram1_available && (numel >= 2)) {
+      // Try 128-byte alignment first (optimal for rvaddf SIMD)
+      size_t per_array_128 = (IDMA_BUFFER_SIZE_DRAM0 / 3) & ~0x7F;  // 128-byte alignment
+      size_t chunk_elements_128 = per_array_128 / FLT32_SIZE;
+      
+      // If 128-byte alignment gives us 0 chunks, try 8-byte alignment (minimum for float32)
+      size_t per_array = per_array_128;
+      size_t chunk_elements = chunk_elements_128;
+      
+      if (chunk_elements == 0) {
+        per_array = (IDMA_BUFFER_SIZE_DRAM0 / 3) & ~0x7;  // Fallback to 8-byte alignment
+        chunk_elements = per_array / FLT32_SIZE;
+      }
+      
+      if (chunk_elements == 0) {
+        // Verify all buffers are 8-byte aligned
+        if (((uintptr_t)ptr_dram0 & 0x7) != 0 || ((uintptr_t)ptr_dram1 & 0x7) != 0) {
+          // Buffer base addresses not aligned, fall back to non-DMA
+          use_dma = false;
+        }
+      } else {
+        // DRAM0: input_a[0] | input_b[0] | output[0] (all 128-byte aligned)
+        inp_a_buff[0] = (float32_t*)ptr_dram0;
+        inp_b_buff[0] = (float32_t*)((uint8_t*)ptr_dram0 + per_array);
+        out_buff[0] = (float32_t*)((uint8_t*)ptr_dram0 + 2 * per_array);
+        
+        // DRAM1: input_a[1] | input_b[1] | output[1] (all 8-byte aligned)
+        inp_a_buff[1] = (float32_t*)ptr_dram1;
+        inp_b_buff[1] = (float32_t*)((uint8_t*)ptr_dram1 + per_array);
+        out_buff[1] = (float32_t*)((uint8_t*)ptr_dram1 + 2 * per_array);
+        
+        chunk_size = chunk_elements;
+        ping_pong_process = true;
+      }
+    }
+    
+    // Strategy 2: Ping-process-pong (1 set of buffers)
+    // Use DRAM0 entirely for inputs (50% a, 50% b), DRAM1 for output
+    if (use_dma && !ping_pong_process && dram0_available && dram1_available) {
+      size_t inp_per_array = (IDMA_BUFFER_SIZE_DRAM0 / 2) & ~0x7;  // Round down to 8-byte boundary
+      size_t inp_capacity = inp_per_array / FLT32_SIZE;
+      size_t out_capacity = IDMA_BUFFER_SIZE_DRAM1 / FLT32_SIZE;
+      
+      if ((inp_capacity > 0) && (out_capacity >= inp_capacity)) {
+        inp_a_buff[0] = (float32_t*)ptr_dram0;
+        inp_b_buff[0] = (float32_t*)((uint8_t*)ptr_dram0 + inp_per_array);
+        out_buff[0] = (float32_t*)ptr_dram1;
+        
+        chunk_size = (inp_capacity < out_capacity) ? inp_capacity : out_capacity;
+        ping_process_pong = true;
+      }
+    }
+
+    if (ping_pong_process || ping_process_pong) {
+      // Writeback inputs from cache to system memory before DMA reads
+      xthal_dcache_region_writeback((void*)a_data, FLT32_SIZE * numel);
+      xthal_dcache_region_writeback((void*)b_data, FLT32_SIZE * numel);
+
+      /* Initialize DMA Channel 0 (loads) and Channel 1 (stores) */
+      dma_2dm_init(0);
+      dma_2dm_init(1);
+
+      if (ping_pong_process) {
+        // Ping-pong processing for better throughput
+        size_t num_chunks = (numel + chunk_size - 1) / chunk_size;
+        if (num_chunks == 0) num_chunks = 1;
+
+        int32_t pp_swap = 0;
+
+        const float* ptr_a = a_data;
+        const float* ptr_b = b_data;
+        float* ptr_out = out_data;
+
+        // Load first chunk (both inputs) into buffer 0 via ch0
+        size_t current_chunk = (numel < chunk_size) ? numel : chunk_size;
+
+        dma_1dm(0, (void*)ptr_a, inp_a_buff[pp_swap], FLT32_SIZE * current_chunk);
+        dma_1dm(0, (void*)ptr_b, inp_b_buff[pp_swap], FLT32_SIZE * current_chunk);
+
+        size_t remaining = numel - current_chunk;
+        ptr_a += current_chunk;
+        ptr_b += current_chunk;
+
+        // Pipeline: load (ch0) and store (ch1) overlap with processing
+        for (size_t i = 0; i < num_chunks - 1; i++) {
+          size_t next_chunk = (remaining < chunk_size) ? remaining : chunk_size;
+
+          // Wait for current loads to complete
+          idma_hw_wait_all(0);
+
+          // Start loading next chunk into alternate buffer via ch0
+          dma_1dm(0, (void*)ptr_a, inp_a_buff[pp_swap ^ 1], FLT32_SIZE * next_chunk);
+          dma_1dm(0, (void*)ptr_b, inp_b_buff[pp_swap ^ 1], FLT32_SIZE * next_chunk);
+
+          // Process current buffer (ch0 loads next in parallel)
+          rvaddf(out_buff[pp_swap], inp_a_buff[pp_swap], inp_b_buff[pp_swap], (int)current_chunk);
+
+          // Wait for previous store to complete before reusing out_buff
+          idma_hw_wait_all(1);
+
+          // Store result via ch1
+          dma_1dm(1, out_buff[pp_swap], (void*)ptr_out, FLT32_SIZE * current_chunk);
+
+          ptr_a += next_chunk;
+          ptr_b += next_chunk;
+          ptr_out += current_chunk;
+          remaining -= next_chunk;
+          current_chunk = next_chunk;
+          pp_swap ^= 1;
+        }
+        
+        // Process last chunk
+        idma_hw_wait_all(0);
+        rvaddf(out_buff[pp_swap], inp_a_buff[pp_swap], inp_b_buff[pp_swap], (int)current_chunk);
+
+        idma_hw_wait_all(1);
+        dma_1dm(1, out_buff[pp_swap], (void*)ptr_out, FLT32_SIZE * current_chunk);
+        idma_hw_wait_all(1);
+
+        // Invalidate output cache: DMA wrote to system memory, cache may have stale data
+        xthal_dcache_region_invalidate(out_data, FLT32_SIZE * numel);
+        
+      } 
+      else if (ping_process_pong) {
+        // Sequential processing
+        size_t remaining = numel;
+        const float* ptr_a = a_data;
+        const float* ptr_b = b_data;
+        float* ptr_out = out_data;
+
+        while (remaining > 0) {
+          size_t current_chunk = (remaining < chunk_size) ? remaining : chunk_size;
+
+          // Load both input chunks via ch0 (overlaps with any pending ch1 store)
+          dma_1dm(0, (void*)ptr_a, inp_a_buff[0], FLT32_SIZE * current_chunk);
+          dma_1dm(0, (void*)ptr_b, inp_b_buff[0], FLT32_SIZE * current_chunk);
+          // Wait for previous store to complete
+          idma_hw_wait_all(1);
+          // Wait for loads to complete
+          idma_hw_wait_all(0);
+
+          // Process: out = a + b
+          rvaddf(out_buff[0], inp_a_buff[0], inp_b_buff[0], (int)current_chunk);
+
+          // Store result via ch1
+          dma_1dm(1, out_buff[0], (void*)ptr_out, FLT32_SIZE * current_chunk);
+
+          ptr_a += current_chunk;
+          ptr_b += current_chunk;
+          ptr_out += current_chunk;
+          remaining -= current_chunk;
+        }
+        idma_hw_wait_all(1);
+
+        // Invalidate output cache: DMA wrote to system memory, cache may have stale data
+        xthal_dcache_region_invalidate(out_data, FLT32_SIZE * numel);
+        
+      }
+    } else {
+      // Fallback: use hardware-optimized vector addition directly without DMA
+      // Writeback+invalidate inputs: ensures CPU-dirty data reaches system memory,
+      // then invalidate forces re-read from system memory (fresh data)
+      xthal_dcache_region_writeback((void*)a_data, FLT32_SIZE * numel);
+      xthal_dcache_region_invalidate((void*)a_data, FLT32_SIZE * numel);
+      xthal_dcache_region_writeback((void*)b_data, FLT32_SIZE * numel);
+      xthal_dcache_region_invalidate((void*)b_data, FLT32_SIZE * numel);
+      rvaddf(out_data, a_data, b_data, (int)numel);
+
+      // Writeback output from cache to system memory for DMA coherency
+      xthal_dcache_region_writeback(out_data, FLT32_SIZE * numel);
+
+    }
+  } else {
+    // Fallback: Use full generic portable implementation
+    // This handles: broadcasting, non-float dtypes, alpha!=1.0, small tensors, all corner cases
+    
+    
+    namespace utils = torch::executor::native::utils;
+    using torch::executor::check_alpha_type;
+    using torch::executor::promoteTypes;
+    using torch::executor::canCast;
+    using torch::executor::resize_to_broadcast_target_size;
+    using torch::executor::tensors_have_same_dim_order;
+    using torch::executor::Error;
+    
+    // Common Dtype
+    ScalarType common_type = promoteTypes(a.scalar_type(), b.scalar_type());
+
+    // Check Common Dtype
+    ET_KERNEL_CHECK(
+        ctx,
+        (canCast(common_type, out.scalar_type()) &&
+         check_alpha_type(utils::get_scalar_dtype(alpha), common_type)),
+        InvalidArgument,
+        out);
+
+    // Check Dim Order
+    ET_KERNEL_CHECK(
+        ctx, 
+        tensors_have_same_dim_order(a, b, out), 
+        InvalidArgument, 
+        out);
+
+    // Resize
+    ET_KERNEL_CHECK(
+        ctx,
+        resize_to_broadcast_target_size(a, b, out) == Error::Ok,
+        InvalidArgument,
+        out);
+
+    // Compute Dtype
+    ScalarType compute_type = utils::get_compute_type(common_type);
+
+    // @lint-ignore CLANGTIDY facebook-hte-CArray
+    static constexpr const char op_name[] = "add.out";
 
+    ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+      CTYPE_COMPUTE val_alpha;
+      ET_KERNEL_CHECK(
+          ctx, utils::extract_scalar(alpha, &val_alpha), InvalidArgument, );
+      utils::apply_bitensor_elementwise_fn<
+          CTYPE_COMPUTE,
+          op_name,
+          utils::SupportedTensorDtypes::REALHBBF16>(
+          [val_alpha](const auto& val_a, const auto& val_b) {
+            return val_a + val_alpha * val_b;
+          },
+          ctx,
+          a,
+          utils::SupportedTensorDtypes::REALHBBF16,
+          b,
+          utils::SupportedTensorDtypes::REALHBBF16,
+          out);
+    });
+    
+  }
   return out;
 }
 
diff --git a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp
index daffecda1bf..41aeb5c20d6 100644
--- a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp
+++ b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp
@@ -6,18 +6,28 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/backends/cadence/vision/kernels/kernels.h>
+#include <lib.h>
+#include <executorch/backends/cadence/generic/kernels/kernels.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::generic::kernels::dequantize;
+
 namespace impl {
 namespace vision {
 namespace native {
 
-using executorch::aten::ScalarType;
-using executorch::aten::Tensor;
-using executorch::runtime::KernelRuntimeContext;
+// Forward declaration of hardware-optimized dequantize function
+extern "C" void dequantize_asym8s_f32(
+    float32_t* restrict ptr_out,
+    const int8_t* restrict ptr_inp,
+    float32_t scale,
+    int zero_bias,
+    int N);
 
-void dequantize_per_tensor_out(
+Tensor& dequantize_per_tensor_out(
     KernelRuntimeContext& context,
     const Tensor& input,
     double scale,
@@ -31,33 +41,284 @@ void dequantize_per_tensor_out(
 
   if (input.scalar_type() == ScalarType::Byte) {
     const uint8_t* input_data = input.const_data_ptr<uint8_t>();
-    kernels::dequantize<uint8_t>(
+    dequantize<uint8_t>(
         out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Char) {
     const int8_t* input_data = input.const_data_ptr<int8_t>();
-    kernels::dequantize<int8_t>(out_data, input_data, scale, zero_point, numel);
+    
+    // Hardware-optimized int8 dequantization with DMA support
+    bool ping_pong_process = false;
+    bool ping_process_pong = false;
+    size_t chunk_size = 0;
+
+    int8_t* inp_buff[2];
+    float32_t* out_buff[2];
+
+    // Check if DRAM buffers are available
+    bool dram0_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0);
+    bool dram1_available = (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0);
+    
+    // DMA has overhead - only beneficial for larger tensors
+    // Threshold: 1024 elements (~1KB for int8, ~4KB for float32)
+    const size_t DMA_THRESHOLD = 1024;
+    bool use_dma = (numel >= DMA_THRESHOLD);
+
+    // Strategy 1: Try ping-pong processing (2 input + 2 output buffers)
+    // Using 20/80 split: 20% for int8 input, 80% for float32 output in each DRAM
+    if (use_dma && dram0_available && dram1_available && (numel >= 2)) {
+      size_t inp_per_buffer = (IDMA_BUFFER_SIZE_DRAM0 * 1) / 5;  // 20% for int8 input (in bytes)
+      size_t out_per_buffer = (IDMA_BUFFER_SIZE_DRAM0 * 4) / (5 * FLT32_SIZE);  // 80% for float32 output
+
+      // Check if 20/80 split fits in both DRAMs
+      if ((inp_per_buffer > 0) && 
+          (out_per_buffer >= inp_per_buffer) &&
+          ((IDMA_BUFFER_SIZE_DRAM0 * 1) / 5 + (IDMA_BUFFER_SIZE_DRAM0 * 4) / 5 <= IDMA_BUFFER_SIZE_DRAM0) &&
+          ((IDMA_BUFFER_SIZE_DRAM1 * 1) / 5 + (IDMA_BUFFER_SIZE_DRAM1 * 4) / 5 <= IDMA_BUFFER_SIZE_DRAM1)) {
+        
+        // Allocate buffers with 20/80 split
+        inp_buff[0] = (int8_t*)ptr_dram0;
+        out_buff[0] = (float32_t*)((uint8_t*)ptr_dram0 + (IDMA_BUFFER_SIZE_DRAM0 * 1) / 5);
+        
+        inp_buff[1] = (int8_t*)ptr_dram1;
+        out_buff[1] = (float32_t*)((uint8_t*)ptr_dram1 + (IDMA_BUFFER_SIZE_DRAM1 * 1) / 5);
+        
+        chunk_size = inp_per_buffer;
+        ping_pong_process = true;
+      }
+    }
+    
+    // Strategy 2: Fallback to ping-process-pong (1 input + 1 output buffer)
+    // Use full DRAM0 for input, full DRAM1 for output (no split needed)
+    if (use_dma && !ping_pong_process && dram0_available && dram1_available) {
+      size_t inp_capacity = IDMA_BUFFER_SIZE_DRAM0;  // Full DRAM0 for int8 input (in bytes)
+      size_t out_capacity = IDMA_BUFFER_SIZE_DRAM1 / FLT32_SIZE;  // Full DRAM1 for float32 output
+      
+      if ((inp_capacity > 0) && (out_capacity >= inp_capacity)) {
+        inp_buff[0] = (int8_t*)ptr_dram0;
+        out_buff[0] = (float32_t*)ptr_dram1;
+        
+        chunk_size = (inp_capacity < out_capacity) ? inp_capacity : out_capacity;
+        ping_process_pong = true;
+      }
+    }
+
+    if (ping_pong_process || ping_process_pong) {
+      const int8_t* ptr_inp = input_data;
+
+      // Writeback input from cache to system memory before DMA reads
+      xthal_dcache_region_writeback((void*)input_data, sizeof(int8_t) * numel);
+
+      /* Initialize DMA Channel 0 (loads) and Channel 1 (stores) */
+      dma_2dm_init(0);
+      dma_2dm_init(1);
+
+      if (ping_pong_process) {
+        // Ping-pong processing for better throughput
+        size_t num_chunks = (numel + chunk_size - 1) / chunk_size;
+        
+        if (num_chunks == 0) num_chunks = 1;
+
+        int32_t pp_swap = 0;
+
+        int8_t* ptr_in = (int8_t*)ptr_inp;
+        float32_t* ptr_out = out_data;
+
+        // Load first chunk via ch0
+        size_t current_chunk = (numel < chunk_size) ? numel : chunk_size;
+
+        dma_1dm(0, ptr_in, inp_buff[pp_swap], sizeof(int8_t) * current_chunk);
+
+        size_t remaining = numel - current_chunk;
+        ptr_in += current_chunk;
+
+        // Pipeline: load (ch0) and store (ch1) overlap with processing
+        for (size_t i = 0; i < (num_chunks - 1); i++) {
+          size_t next_chunk = (remaining < chunk_size) ? remaining : chunk_size;
+
+          // Wait for current load to complete
+          idma_hw_wait_all(0);
+
+          // Start loading next chunk into alternate buffer via ch0
+          dma_1dm(0, ptr_in, inp_buff[pp_swap ^ 1], sizeof(int8_t) * next_chunk);
+
+          // Process current chunk (ch0 loads next in parallel)
+          dequantize_asym8s_f32(out_buff[pp_swap], inp_buff[pp_swap], (float)scale, (int)zero_point, (int)current_chunk);
+
+          // Wait for previous store to complete before reusing out_buff
+          idma_hw_wait_all(1);
+
+          // Store result via ch1
+          dma_1dm(1, out_buff[pp_swap], ptr_out, FLT32_SIZE * current_chunk);
+
+          ptr_in += next_chunk;
+          ptr_out += current_chunk;
+          remaining -= next_chunk;
+          current_chunk = next_chunk;
+          pp_swap ^= 1;
+        }
+
+        // Process last chunk
+        idma_hw_wait_all(0);
+        dequantize_asym8s_f32(out_buff[pp_swap], inp_buff[pp_swap], (float)scale, (int)zero_point, (int)current_chunk);
+
+        idma_hw_wait_all(1);
+        dma_1dm(1, out_buff[pp_swap], ptr_out, FLT32_SIZE * current_chunk);
+        idma_hw_wait_all(1);
+
+        // Invalidate output cache: DMA wrote to system memory, cache may have stale data
+        xthal_dcache_region_invalidate(out_data, FLT32_SIZE * numel);
+        
+      } 
+      else if (ping_process_pong) {
+        // Simple sequential processing
+        size_t remaining = numel;
+        int8_t* ptr_in = (int8_t*)ptr_inp;
+        float32_t* ptr_out = out_data;
+
+        while (remaining > 0) {
+          size_t current_chunk = (remaining < chunk_size) ? remaining : chunk_size;
+
+          // Start load via ch0 (overlaps with any pending ch1 store)
+          dma_1dm(0, ptr_in, inp_buff[0], sizeof(int8_t) * current_chunk);
+          // Wait for previous store to complete (out_buff[0] safe to write)
+          idma_hw_wait_all(1);
+          // Wait for load to complete
+          idma_hw_wait_all(0);
+
+          // Process
+          dequantize_asym8s_f32(out_buff[0], inp_buff[0], (float)scale, (int)zero_point, (int)current_chunk);
+
+          // Store via ch1
+          dma_1dm(1, out_buff[0], ptr_out, FLT32_SIZE * current_chunk);
+
+          ptr_in += current_chunk;
+          ptr_out += current_chunk;
+          remaining -= current_chunk;
+        }
+        idma_hw_wait_all(1);
+
+        // Invalidate output cache: DMA wrote to system memory, cache may have stale data
+        xthal_dcache_region_invalidate(out_data, FLT32_SIZE * numel);
+        
+      }
+      
+    } else {
+      // No DMA: use hardware function on full tensor at once
+      // Writeback+invalidate input: ensures CPU-dirty data reaches system memory,
+      // then invalidate forces re-read from system memory (fresh data)
+      xthal_dcache_region_writeback((void*)input_data, sizeof(int8_t) * numel);
+      xthal_dcache_region_invalidate((void*)input_data, sizeof(int8_t) * numel);
+      dequantize_asym8s_f32(out_data, input_data, (float)scale, (int)zero_point, (int)numel);
+
+      // Writeback output from cache to system memory for DMA coherency
+      xthal_dcache_region_writeback(out_data, sizeof(float) * numel);
+
+    }
   } else if (
       input.scalar_type() == ScalarType::Bits16 ||
       input.scalar_type() == ScalarType::UInt16) {
     const uint16_t* input_data = input.const_data_ptr<uint16_t>();
-    kernels::dequantize<uint16_t>(
-        out_data, input_data, scale, zero_point, numel);
+    dequantize<uint16_t>(out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Short) {
     const int16_t* input_data = input.const_data_ptr<int16_t>();
-    kernels::dequantize<int16_t>(
-        out_data, input_data, scale, zero_point, numel);
+    dequantize<int16_t>(out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Int) {
     const int32_t* input_data = input.const_data_ptr<int32_t>();
-    kernels::dequantize<int32_t>(
-        out_data, input_data, scale, zero_point, numel);
+    dequantize<int32_t>(out_data, input_data, scale, zero_point, numel);
   } else {
     ET_CHECK_MSG(
         false,
         "Unhandled input dtype %hhd",
         static_cast<int8_t>(input.scalar_type()));
   }
+  return out;
+}
+
+// Dequantizes an int8 tensor into float `out` via the generic kernel template.
+Tensor& dequantize_per_tensor_asym8s_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    double scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  // context, quant_min, quant_max and dtype are accepted but not consulted.
+  const size_t element_count = out.numel();
+  dequantize<int8_t>(out.mutable_data_ptr<float>(),
+      input.const_data_ptr<int8_t>(), scale, zero_point, element_count);
+  return out;
+}
+
+// Dequantizes a uint8 tensor into float `out` via the generic kernel template.
+Tensor& dequantize_per_tensor_asym8u_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    double scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  // context, quant_min, quant_max and dtype are accepted but not consulted.
+  const size_t element_count = out.numel();
+  dequantize<uint8_t>(out.mutable_data_ptr<float>(),
+      input.const_data_ptr<uint8_t>(), scale, zero_point, element_count);
+  return out;
+}
+
+// Dequantizes an int16 tensor into float `out` via the generic kernel template.
+Tensor& dequantize_per_tensor_asym16s_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    double scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  // context, quant_min, quant_max and dtype are accepted but not consulted.
+  const size_t element_count = out.numel();
+  dequantize<int16_t>(out.mutable_data_ptr<float>(),
+      input.const_data_ptr<int16_t>(), scale, zero_point, element_count);
+  return out;
+}
+
+// Dequantizes a uint16 tensor into float `out` via the generic kernel template.
+Tensor& dequantize_per_tensor_asym16u_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    double scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  // context, quant_min, quant_max and dtype are accepted but not consulted.
+  const size_t element_count = out.numel();
+  dequantize<uint16_t>(out.mutable_data_ptr<float>(),
+      input.const_data_ptr<uint16_t>(), scale, zero_point, element_count);
+  return out;
+}
+
+// Dequantizes an int32 tensor into float `out` via the generic kernel template.
+Tensor& dequantize_per_tensor_asym32s_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    double scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  // context, quant_min, quant_max and dtype are accepted but not consulted.
+  const size_t element_count = out.numel();
+  dequantize<int32_t>(out.mutable_data_ptr<float>(),
+      input.const_data_ptr<int32_t>(), scale, zero_point, element_count);
+  return out;
 }
 
-}; // namespace native
-}; // namespace vision
-}; // namespace impl
+} // namespace native
+} // namespace vision
+} // namespace impl
diff --git a/backends/cadence/vision/operators/op_max_pool2d_with_indices.cpp b/backends/cadence/vision/operators/op_max_pool2d_with_indices.cpp
new file mode 100644
index 00000000000..e1e468e1e33
--- /dev/null
+++ b/backends/cadence/vision/operators/op_max_pool2d_with_indices.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <lib.h>
+#include <cstring>
+#include <tuple>
+
+#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/backends/cadence/vision/operators/layer_configs.h>
+
+/* DMA-tiled and no-DMA maxpool executors (defined in maxpool_exec_mxnj2.c) */
+extern "C" {
+typedef int XAI_ERR_TYPE;
+XAI_ERR_TYPE maxpool_exec_mxnj2(
+    float* src, float* dst, const maxpool_layer_config_t* config);
+XAI_ERR_TYPE maxpool_exec_mxnj2_no_dma(
+    float* src, float* dst, const maxpool_layer_config_t* config);
+}
+
+using executorch::aten::Tensor;
+using executorch::aten::ScalarType;
+using executorch::aten::IntArrayRef;
+using executorch::runtime::KernelRuntimeContext;
+using torch::executor::Error;
+
+namespace impl {
+namespace vision {
+namespace native {
+
+// max_pool2d_with_indices.out for the Cadence Vision backend.
+// Validates the arguments and resizes out/indices, then uses the stride-2
+// HW executors when a pre-computed layer config matches; every other call
+// runs the portable reduce-then-map kernel.
+std::tuple<Tensor&, Tensor&> max_pool2d_with_indices_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& in,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    bool ceil_mode,
+    Tensor& out,
+    Tensor& indices) {
+  std::tuple<Tensor&, Tensor&> ret_val(out, indices);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      torch::executor::check_max_pool2d_with_indices_args(
+          in, kernel_size, stride, padding, dilation, ceil_mode, out, indices),
+      InvalidArgument,
+      ret_val);
+
+  size_t output_ndim = 0;
+  executorch::aten::SizesType output_sizes[executorch::runtime::kTensorDimensionLimit];
+  torch::executor::get_max_pool2d_with_indices_out_target_size(
+      in,
+      kernel_size,
+      stride,
+      padding,
+      dilation,
+      ceil_mode,
+      output_sizes,
+      &output_ndim);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      torch::executor::output_size_is_valid({output_sizes, output_ndim}, 2),
+      InvalidArgument,
+      ret_val);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      executorch::runtime::resize_tensor(out, {output_sizes, output_ndim}) == Error::Ok,
+      InvalidArgument,
+      ret_val);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      executorch::runtime::resize_tensor(indices, {output_sizes, output_ndim}) == Error::Ok,
+      InvalidArgument,
+      ret_val);
+
+  // ── HW-optimized path: float32 4-D input, stride == 2 ────────────
+  // The config lookup below has no dilation/ceil_mode key and reads element
+  // [1] of kernel_size/stride/padding, so any other call takes the generic path.
+  // NOTE: this path fills only `out`; `indices` is resized but never written.
+  bool hw_ok = in.dim() == 4 && in.scalar_type() == ScalarType::Float &&
+      out.scalar_type() == ScalarType::Float && !ceil_mode &&
+      kernel_size.size() == 2 && stride.size() == 2 && padding.size() == 2;
+  for (size_t d = 0; hw_ok && d < dilation.size(); d++) {
+    hw_ok = (dilation[d] == 1);
+  }
+  if (hw_ok && stride[0] == 2 && stride[1] == 2) {
+    float* ptr_out = out.mutable_data_ptr<float>();
+    const float* ptr_inp = in.const_data_ptr<float>();
+    int batch = in.size(0);
+    int channels = in.size(1);
+    int inp_height = in.size(2); int inp_width = in.size(3);
+    int out_height = out.size(2); int out_width = out.size(3);
+
+    // Look up pre-computed config for this layer
+    const maxpool_layer_config_t* mp_cfg = get_maxpool_config_by_params(
+        channels, inp_height, inp_width,
+        (uint8_t)kernel_size[0], (uint8_t)kernel_size[1],
+        stride[0], stride[1],
+        padding[0], padding[1]);
+
+    if (mp_cfg != NULL) {
+      // Writeback input from cache to system memory: previous op may have
+      // written via CPU/cache, and maxpool's DMA kernel reads system memory.
+      xthal_dcache_region_writeback((void*)ptr_inp, sizeof(float) * in.numel());
+
+      // Check if DRAM buffers are available for DMA tiling
+      bool dram_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0)
+                         && (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0);
+
+      for (int b = 0; b < batch; b++) {
+        float* batch_inp = (float*)ptr_inp + b * channels * inp_height * inp_width;
+        float* batch_out = ptr_out + b * channels * out_height * out_width;
+        XAI_ERR_TYPE status;
+        if (dram_available) {
+          status = maxpool_exec_mxnj2(batch_inp, batch_out, mp_cfg);
+        } else {
+          status = maxpool_exec_mxnj2_no_dma(batch_inp, batch_out, mp_cfg);
+        }
+        ET_KERNEL_CHECK(ctx, status == 0, InvalidArgument, ret_val);
+      }
+
+      // Invalidate output cache: executor wrote to system memory
+      xthal_dcache_region_invalidate(ptr_out, sizeof(float) * out.numel());
+
+      return ret_val;
+    }
+  }
+
+  // ── Generic fallback: HW path not applicable or no config found ───
+  ScalarType in_type = in.scalar_type();
+  ET_SWITCH_REALHBF16_TYPES(
+      in_type, ctx, "max_pool2d_with_indices.out", CTYPE, [&]() {
+        torch::executor::apply_kernel_2d_reduce_then_map_fn<CTYPE>(
+            [](const CTYPE in_val, const int64_t in_idx,
+               const CTYPE accum, const int64_t accum_idx) {
+              if (in_val > accum) {
+                return std::tuple<CTYPE, int64_t>(in_val, in_idx);
+              }
+              return std::tuple<CTYPE, int64_t>(accum, accum_idx);
+            },
+            // Max pooling does not need to post-process the accumulated output
+            [](const int64_t count, const CTYPE accum) { return accum; },
+            /*include_pad=*/false,
+            in, kernel_size, stride, padding, dilation,
+            out, {indices});
+      });
+  return ret_val;
+}
+
+} // namespace native
+} // namespace vision
+} // namespace impl
diff --git a/backends/cadence/vision/operators/op_mean.cpp b/backends/cadence/vision/operators/op_mean.cpp
new file mode 100644
index 00000000000..13f946109ba
--- /dev/null
+++ b/backends/cadence/vision/operators/op_mean.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <lib.h>
+#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
+#include <executorch/kernels/portable/cpu/util/reduce_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/platform/assert.h>
+
+/* DMA-tiled mean executor (defined in mean/mean_exec_dma.c); mean_out treats any non-zero return as failure. */
+extern "C" {
+typedef int XAI_ERR_TYPE;
+XAI_ERR_TYPE mean_exec_dma(
+    const float* src, float* dst,
+    int channels, int spatial_h, int spatial_w);
+}
+
+using executorch::aten::RuntimeContext;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::ArrayRef;
+using executorch::runtime::KernelRuntimeContext;
+using torch::executor::Error;
+using torch::executor::optional;
+
+namespace impl {
+namespace vision {
+namespace native {
+
+// Forward declaration of the hardware-optimized 2x2 -> 1x1 mean routine (C linkage)
+extern "C" void simd_mean_pool_2x2_to_1x1_float32(
+    float32_t* restrict output,
+    const float32_t* restrict input,
+    int N);  // mean_out passes batch * channels * 4, i.e. its input element count
+
+// mean.out kernel: average `in` over `dim_list` and store the result in `out`.
+//
+// Argument validation and the resize of `out` are wrapped in ET_KERNEL_CHECK
+// with InvalidArgument. A float32 4-D input that is reduced over dims 2 and 3,
+// both of size 2, takes the accelerated path: the DMA executor when both DRAM
+// banks are usable, otherwise (or when that executor reports an error) the
+// SIMD routine. Every other case runs the portable loop. Returns `out`.
+Tensor& mean_out(
+    RuntimeContext& ctx,
+    const Tensor& in,
+    optional<ArrayRef<int64_t>> dim_list,
+    bool keepdim,
+    optional<ScalarType> dtype,
+    Tensor& out) {
+  ET_KERNEL_CHECK(
+      ctx,
+      torch::executor::check_mean_dim_args(in, dim_list, keepdim, dtype, out),
+      InvalidArgument,
+      out);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      torch::executor::resize_reduction_out(in, dim_list, keepdim, out) ==
+          Error::Ok,
+      InvalidArgument,
+      out);
+
+  constexpr auto name = "mean.out";
+
+  // Check if we can use hardware-optimized path
+  // Requires: float32, specific reduction pattern (2x2 spatial to 1x1)
+  bool optimized = false;
+
+  if (in.scalar_type() == ScalarType::Float &&
+      out.scalar_type() == ScalarType::Float &&
+      dim_list.has_value()) {
+    auto dims = dim_list.value();
+    int num_inp_dims = in.dim();
+
+    // Check for 4D tensor with reduction on last 2 dimensions (H, W)
+    // Input: [N, C, H, W], reduce [H, W] -> [N, C, 1, 1]
+    if (num_inp_dims == 4 && dims.size() == 2) {
+      // Normalize negative dimensions
+      int64_t dim0 = dims[0] < 0 ? dims[0] + num_inp_dims : dims[0];
+      int64_t dim1 = dims[1] < 0 ? dims[1] + num_inp_dims : dims[1];
+      // Check if reducing dimensions 2 and 3 (H and W in NCHW format)
+      if ((dim0 == 2 && dim1 == 3) || (dim0 == 3 && dim1 == 2)) {
+        // Check if spatial dimensions are 2x2
+        if (in.size(2) == 2 && in.size(3) == 2) {
+          optimized = true;
+        }
+      }
+    }
+  }
+
+  if (optimized) {
+    const float* input_data = in.const_data_ptr<float>();
+    float* out_data = out.mutable_data_ptr<float>();
+
+    int batch_size = in.size(0);
+    int channels = in.size(1);
+    int total_channels = batch_size * channels;
+
+    // Check if DRAM buffers are available for DMA
+    bool dram0_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0);
+    bool dram1_available = (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0);
+
+    // Use DMA ping-pong tiled executor when both DRAM banks are available
+    bool use_dma = dram0_available && dram1_available;
+
+    if (use_dma) {
+      // Writeback input from cache to system memory before DMA reads
+      xthal_dcache_region_writeback((void*)input_data, sizeof(float) * in.numel());
+
+      // Process each batch independently through the DMA executor
+      for (int b = 0; b < batch_size; b++) {
+        const float* batch_inp = input_data + b * channels * 4;
+        float* batch_out = out_data + b * channels;
+
+        XAI_ERR_TYPE status = mean_exec_dma(batch_inp, batch_out, channels, 2, 2);
+        if (status != 0) {
+          // DMA executor failed (buffer too small?); the SIMD path below redoes every batch
+          use_dma = false;
+          break;
+        }
+      }
+
+      if (use_dma) {
+        // Invalidate output cache: DMA wrote to system memory
+        xthal_dcache_region_invalidate(out_data, sizeof(float) * out.numel());
+
+        return out;
+      }
+    }
+
+    // Fallback: Direct SIMD without DMA (no DRAM banks, or the DMA executor failed)
+    // Writeback+invalidate input: ensures CPU-dirty data reaches system memory,
+    // then invalidate forces re-read from system memory (fresh data)
+    xthal_dcache_region_writeback((void*)input_data, sizeof(float) * in.numel());
+    xthal_dcache_region_invalidate((void*)input_data, sizeof(float) * in.numel());
+    simd_mean_pool_2x2_to_1x1_float32(out_data, input_data, total_channels * 4);
+    xthal_dcache_region_writeback(out_data, sizeof(float) * out.numel());
+
+    return out;
+  }
+
+  // Fallback to portable implementation
+  ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
+    ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
+      CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
+      const size_t num = torch::executor::get_reduced_dim_product(in, dim_list);
+
+      for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
+        CTYPE_OUT sum = 0;
+        if (in.numel() > 0) {
+          sum = torch::executor::map_reduce_over_dim_list<CTYPE_IN, CTYPE_OUT>(
+              [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
+              [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+              in,
+              dim_list,
+              out_ix);
+        }
+        out_data[out_ix] = sum / static_cast<float>(num);
+      }
+    });
+  });
+
+  return out;
+}
+
+Tensor& mean_dim_out(
+    RuntimeContext& ctx,
+    const Tensor& in,
+    optional<ArrayRef<int64_t>> dim_list,
+    bool keepdim,
+    optional<ScalarType> dtype,
+    Tensor& out) {
+  return mean_out(ctx, in, dim_list, keepdim, dtype, out);  // pure forwarder: every argument passes through unchanged
+}
+
+} // namespace native
+} // namespace vision
+} // namespace impl
diff --git a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp
index cd72d2de2b5..ceeafb98c70 100644
--- a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp
+++ b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp
@@ -1,66 +1,330 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include <executorch/backends/cadence/vision/kernels/kernels.h>
-#include <executorch/runtime/kernel/kernel_includes.h>
-
-namespace impl {
-namespace vision {
-namespace native {
-
-using executorch::aten::ScalarType;
-using executorch::aten::Tensor;
-using executorch::runtime::KernelRuntimeContext;
-
-// Quantize the input tensor (PT2 version). Note that quant_<min,max> are not
-// used in any computation.
-void quantize_per_tensor_out(
-    KernelRuntimeContext& context,
-    const Tensor& input,
-    double scale,
-    int64_t zero_point,
-    int64_t quant_min,
-    int64_t quant_max,
-    ScalarType dtype,
-    Tensor& out) {
-  const float* input_data = input.const_data_ptr<float>();
-  size_t numel = out.numel();
-
-  if (out.scalar_type() == ScalarType::Byte) {
-    uint8_t* out_data = out.mutable_data_ptr<uint8_t>();
-    kernels::quantize<uint8_t>(
-        out_data, input_data, 1. / scale, zero_point, numel);
-  } else if (out.scalar_type() == ScalarType::Char) {
-    int8_t* out_data = out.mutable_data_ptr<int8_t>();
-    kernels::quantize<int8_t>(
-        out_data, input_data, 1. / scale, zero_point, numel);
-  } else if (
-      out.scalar_type() == ScalarType::Bits16 ||
-      out.scalar_type() == ScalarType::UInt16) {
-    uint16_t* out_data = out.mutable_data_ptr<uint16_t>();
-    kernels::quantize<uint16_t>(
-        out_data, input_data, 1. / scale, zero_point, numel);
-  } else if (out.scalar_type() == ScalarType::Short) {
-    int16_t* out_data = out.mutable_data_ptr<int16_t>();
-    kernels::quantize<int16_t>(
-        out_data, input_data, 1. / scale, zero_point, numel);
-  } else if (out.scalar_type() == ScalarType::Int) {
-    int32_t* out_data = out.mutable_data_ptr<int32_t>();
-    kernels::quantize<int32_t>(
-        out_data, input_data, 1. / scale, zero_point, numel);
-  } else {
-    ET_CHECK_MSG(
-        false,
-        "Unhandled input dtype %hhd",
-        static_cast<int8_t>(out.scalar_type()));
-  }
-}
-
-}; // namespace native
-}; // namespace vision
-}; // namespace impl
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <lib.h>
+#include <cstdio>
+#include <executorch/backends/cadence/generic/kernels/kernels.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::KernelRuntimeContext;
+using ::impl::generic::kernels::quantize;
+
+namespace impl {
+namespace vision {
+namespace native {
+
+// Forward declaration of the hardware-optimized float32 -> int8 quantize routine (C linkage)
+extern "C" void quantize_f32_asym8s(
+    int8_t* restrict ptr_out,
+    const float32_t* restrict ptr_inp,
+    float32_t scale,  // call sites in this file pass the quantization scale itself, not 1/scale
+    int zero_bias,
+    int N);
+
+// Quantize the float32 `input` into `out` with one per-tensor `scale` and
+// `zero_point` (PT2 version). Note that quant_<min,max> are not used in any
+// computation; the output element type is taken from out.scalar_type().
+//
+// Byte, Bits16/UInt16, Short and Int outputs use the generic quantize<T>()
+// template, which is handed 1/scale. Char (int8) outputs use
+// quantize_f32_asym8s(), which is handed `scale` itself, and are staged
+// through the DRAM banks with DMA when the tensor has at least 1024 elements
+// and both banks are available; otherwise the routine runs on the tensor
+// directly. Any other output type trips ET_CHECK_MSG. Returns `out`.
+Tensor& quantize_per_tensor_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    double scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  const float* input_data = input.const_data_ptr<float>();
+  size_t numel = out.numel();
+
+  if (out.scalar_type() == ScalarType::Byte) {
+    uint8_t* out_data = out.mutable_data_ptr<uint8_t>();
+    quantize<uint8_t>(out_data, input_data, 1. / scale, zero_point, numel);
+  } else if (out.scalar_type() == ScalarType::Char) {
+    int8_t* out_data = out.mutable_data_ptr<int8_t>();
+
+    // Hardware-optimized int8 quantization with DMA support
+    bool ping_pong_process = false;
+    bool ping_process_pong = false;
+    size_t chunk_size = 0;
+
+    float32_t* inp_buff[2];
+    int8_t* out_buff[2];
+
+    // Check if DRAM buffers are available
+    bool dram0_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0);
+    bool dram1_available = (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0);
+
+    // DMA has overhead - only beneficial for larger tensors
+    // Threshold: 1024 elements (~4KB for float32, ~1KB for int8)
+    const size_t DMA_THRESHOLD = 1024;
+    bool use_dma = (numel >= DMA_THRESHOLD);
+
+    // Strategy 1: Try ping-pong processing (2 input + 2 output buffers)
+    // Using 80/20 split: 80% for input, 20% for output in each DRAM
+    if (use_dma && dram0_available && dram1_available) {
+      size_t inp_per_buffer = (IDMA_BUFFER_SIZE_DRAM0 * 4) / (5 * FLT32_SIZE);  // 80% for float32 input
+      size_t out_per_buffer_dram0 = (IDMA_BUFFER_SIZE_DRAM0 * 1) / 5;  // 20% for int8 output
+      size_t out_per_buffer_dram1 = (IDMA_BUFFER_SIZE_DRAM1 * 1) / 5;  // 20% for int8 output
+
+      // The chunk size is derived from DRAM0 only. Requiring DRAM1's 20% output
+      // region to hold one chunk also guarantees that DRAM1's 80% input region
+      // (about four times larger) holds the chunk's float32 data.
+      if ((inp_per_buffer > 0) &&
+          (out_per_buffer_dram0 >= inp_per_buffer) &&
+          (out_per_buffer_dram1 >= inp_per_buffer)) {
+        // Allocate buffers with 80/20 split
+        inp_buff[0] = (float32_t*)ptr_dram0;
+        out_buff[0] = (int8_t*)((uint8_t*)ptr_dram0 + (IDMA_BUFFER_SIZE_DRAM0 * 4) / 5);
+
+        inp_buff[1] = (float32_t*)ptr_dram1;
+        out_buff[1] = (int8_t*)((uint8_t*)ptr_dram1 + (IDMA_BUFFER_SIZE_DRAM1 * 4) / 5);
+
+        chunk_size = inp_per_buffer;
+        ping_pong_process = true;
+      }
+    }
+
+    // Strategy 2: Fallback to ping-process-pong (1 input + 1 output buffer)
+    // Use full DRAM0 for input, full DRAM1 for output (no split needed)
+    if (use_dma && !ping_pong_process && dram0_available && dram1_available) {
+      size_t inp_capacity = IDMA_BUFFER_SIZE_DRAM0 / FLT32_SIZE;  // Full DRAM0 for input
+      size_t out_capacity = IDMA_BUFFER_SIZE_DRAM1;  // Full DRAM1 for output
+
+      if ((inp_capacity > 0) && (out_capacity >= inp_capacity)) {
+        inp_buff[0] = (float32_t*)ptr_dram0;
+        out_buff[0] = (int8_t*)ptr_dram1;
+
+        chunk_size = (inp_capacity < out_capacity) ? inp_capacity : out_capacity;
+        ping_process_pong = true;
+      }
+    }
+
+    if (ping_pong_process || ping_process_pong) {
+      const float32_t* ptr_inp = (float32_t*)input_data;
+
+      // Writeback input from cache to system memory before DMA reads
+      xthal_dcache_region_writeback((void*)input_data, FLT32_SIZE * numel);
+
+      /* Initialize DMA Channel 0 (loads) and Channel 1 (stores) */
+      dma_2dm_init(0);
+      dma_2dm_init(1);
+
+      if (ping_pong_process) {
+        // Ping-pong processing; numel >= DMA_THRESHOLD and chunk_size > 0, so num_chunks >= 1
+        size_t num_chunks = (numel + chunk_size - 1) / chunk_size;
+
+        int32_t pp_swap = 0;
+
+        float32_t* ptr_in = (float32_t*)ptr_inp;
+        int8_t* ptr_out = out_data;
+
+        // Load first chunk via ch0
+        size_t current_chunk = (numel < chunk_size) ? numel : chunk_size;
+
+        dma_1dm(0, ptr_in, inp_buff[pp_swap], FLT32_SIZE * current_chunk);
+
+        size_t remaining = numel - current_chunk;
+        ptr_in += current_chunk;
+
+        // Pipeline: load (ch0) and store (ch1) overlap with processing
+        for (size_t i = 0; i < (num_chunks - 1); i++) {
+          size_t next_chunk = (remaining < chunk_size) ? remaining : chunk_size;
+
+          // Wait for current load to complete
+          idma_hw_wait_all(0);
+
+          // Start loading next chunk into alternate buffer via ch0
+          dma_1dm(0, ptr_in, inp_buff[pp_swap ^ 1], FLT32_SIZE * next_chunk);
+
+          // Process current chunk (ch0 loads next in parallel)
+          quantize_f32_asym8s(out_buff[pp_swap], inp_buff[pp_swap], (float)scale, (int)zero_point, (int)current_chunk);
+
+          // Drain ch1 before queuing the next store. The same wait one iteration
+          // earlier already retired the store that last read out_buff[pp_swap],
+          // which is why the quantize call above could overwrite that buffer.
+          idma_hw_wait_all(1);
+
+          // Store result via ch1
+          dma_1dm(1, out_buff[pp_swap], ptr_out, sizeof(int8_t) * current_chunk);
+
+          ptr_in += next_chunk;
+          ptr_out += current_chunk;
+          remaining -= next_chunk;
+          current_chunk = next_chunk;
+          pp_swap ^= 1;
+        }
+
+        // Process last chunk
+        idma_hw_wait_all(0);
+        quantize_f32_asym8s(out_buff[pp_swap], inp_buff[pp_swap], (float)scale, (int)zero_point, (int)current_chunk);
+
+        idma_hw_wait_all(1);
+        dma_1dm(1, out_buff[pp_swap], ptr_out, sizeof(int8_t) * current_chunk);
+        idma_hw_wait_all(1);
+
+        // Invalidate output cache: DMA wrote to system memory, cache may have stale data
+        xthal_dcache_region_invalidate(out_data, sizeof(int8_t) * numel);
+      } else if (ping_process_pong) {
+        // Simple sequential processing
+        size_t remaining = numel;
+        float32_t* ptr_in = (float32_t*)ptr_inp;
+        int8_t* ptr_out = out_data;
+
+        while (remaining > 0) {
+          size_t current_chunk = (remaining < chunk_size) ? remaining : chunk_size;
+
+          // Start load via ch0 (overlaps with any pending ch1 store)
+          dma_1dm(0, ptr_in, inp_buff[0], FLT32_SIZE * current_chunk);
+          // Wait for previous store to complete (out_buff[0] safe to write)
+          idma_hw_wait_all(1);
+          // Wait for load to complete
+          idma_hw_wait_all(0);
+
+          // Process
+          quantize_f32_asym8s(out_buff[0], inp_buff[0], (float)scale, (int)zero_point, (int)current_chunk);
+
+          // Store via ch1
+          dma_1dm(1, out_buff[0], ptr_out, sizeof(int8_t) * current_chunk);
+
+          ptr_in += current_chunk;
+          ptr_out += current_chunk;
+          remaining -= current_chunk;
+        }
+        idma_hw_wait_all(1);
+
+        // Invalidate output cache: DMA wrote to system memory, cache may have stale data
+        xthal_dcache_region_invalidate(out_data, sizeof(int8_t) * numel);
+      }
+    } else {
+      // No DMA: use hardware function on full tensor at once
+      // Writeback+invalidate input: ensures CPU-dirty data reaches system memory,
+      // then invalidate forces re-read from system memory (fresh data)
+      xthal_dcache_region_writeback((void*)input_data, FLT32_SIZE * numel);
+      xthal_dcache_region_invalidate((void*)input_data, FLT32_SIZE * numel);
+      quantize_f32_asym8s(out_data, input_data, (float)scale, (int)zero_point, (int)numel);
+
+      // Writeback output from cache to system memory for DMA coherency
+      xthal_dcache_region_writeback(out_data, sizeof(int8_t) * numel);
+    }
+  } else if (
+      out.scalar_type() == ScalarType::Bits16 ||
+      out.scalar_type() == ScalarType::UInt16) {
+    uint16_t* out_data = out.mutable_data_ptr<uint16_t>();
+    quantize<uint16_t>(out_data, input_data, 1. / scale, zero_point, numel);
+  } else if (out.scalar_type() == ScalarType::Short) {
+    int16_t* out_data = out.mutable_data_ptr<int16_t>();
+    quantize<int16_t>(out_data, input_data, 1. / scale, zero_point, numel);
+  } else if (out.scalar_type() == ScalarType::Int) {
+    int32_t* out_data = out.mutable_data_ptr<int32_t>();
+    quantize<int32_t>(out_data, input_data, 1. / scale, zero_point, numel);
+  } else {
+    ET_CHECK_MSG(
+        false,
+        "Unhandled output dtype %hhd",
+        static_cast<int8_t>(out.scalar_type()));
+  }
+  return out;
+}
+
+// asym8s (int8) variant: always goes through the generic quantize<int8_t>() template
+Tensor& quantize_per_tensor_asym8s_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    double scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  const size_t count = out.numel();
+  const float* src = input.const_data_ptr<float>();
+  int8_t* dst = out.mutable_data_ptr<int8_t>();
+  quantize<int8_t>(dst, src, 1. / scale, zero_point, count);
+  return out;
+}
+
+// asym8u (uint8) variant: always goes through the generic quantize<uint8_t>() template
+Tensor& quantize_per_tensor_asym8u_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    double scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  const size_t count = out.numel();
+  const float* src = input.const_data_ptr<float>();
+  uint8_t* dst = out.mutable_data_ptr<uint8_t>();
+  quantize<uint8_t>(dst, src, 1. / scale, zero_point, count);
+  return out;
+}
+
+// asym16s (int16) variant: always goes through the generic quantize<int16_t>() template
+Tensor& quantize_per_tensor_asym16s_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    double scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  const size_t count = out.numel();
+  const float* src = input.const_data_ptr<float>();
+  int16_t* dst = out.mutable_data_ptr<int16_t>();
+  quantize<int16_t>(dst, src, 1. / scale, zero_point, count);
+  return out;
+}
+
+// asym16u (uint16) variant: always goes through the generic quantize<uint16_t>() template
+Tensor& quantize_per_tensor_asym16u_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    double scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  const size_t count = out.numel();
+  const float* src = input.const_data_ptr<float>();
+  uint16_t* dst = out.mutable_data_ptr<uint16_t>();
+  quantize<uint16_t>(dst, src, 1. / scale, zero_point, count);
+  return out;
+}
+
+// asym32s (int32) variant: always goes through the generic quantize<int32_t>() template
+Tensor& quantize_per_tensor_asym32s_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    double scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  const size_t count = out.numel();
+  const float* src = input.const_data_ptr<float>();
+  int32_t* dst = out.mutable_data_ptr<int32_t>();
+  quantize<int32_t>(dst, src, 1. / scale, zero_point, count);
+  return out;
+}
+
+} // namespace native
+} // namespace vision
+} // namespace impl
diff --git a/backends/cadence/vision/operators/op_quantized_conv_out.cpp b/backends/cadence/vision/operators/op_quantized_conv_out.cpp
index be4b34bff03..0d47331c367 100644
--- a/backends/cadence/vision/operators/op_quantized_conv_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_conv_out.cpp
@@ -6,8 +6,25 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/backends/cadence/vision/kernels/kernels.h>
+#include <lib.h>
+#include <stdio.h>
+#include <executorch/backends/cadence/generic/kernels/kernels.h>
+#include <executorch/backends/cadence/generic/operators/cadence_type_util.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/backends/cadence/vision/operators/operators.h>
+#include <executorch/backends/cadence/vision/operators/layer_configs.h>
+
+// Forward declaration of conv_execute_kernel (defined in conv_kernel_dispatcher.c); the caller below passes int32 bias data through bias_ptr
+extern "C" {
+typedef int XAI_ERR_TYPE;
+XAI_ERR_TYPE conv_execute_kernel(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config);
+}
 
 namespace impl {
 namespace vision {
@@ -141,7 +158,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic(
             if (quantized) {
               float val = bias_scale * acc;
               out_plane[_oh * ow + _ow] =
-                  kernels::quantize<OT>(val, inv_out_scale, out_zero_point);
+                  ::impl::generic::kernels::quantize<OT>(val, inv_out_scale, out_zero_point);
             } else {
               out_plane[_oh * ow + _ow] = acc;
             }
@@ -267,7 +284,7 @@ __attribute__((noinline)) void conv2d_nhwc_core_generic(
             if (quantized) {
               float val = bias_scale * acc;
               out_line[_oc] =
-                  kernels::quantize<OT>(val, inv_out_scale, out_zero_point);
+                  ::impl::generic::kernels::quantize<OT>(val, inv_out_scale, out_zero_point);
             } else {
               out_line[_oc] = acc;
             }
@@ -296,6 +313,7 @@ void quantized_conv_nchw(
     float output_scale,
     int32_t output_zero_point,
     Tensor& out) {
+
   bool conv1d = input.dim() == 3;
   // input = [n, c, h, w]
   const int n = input.size(0);
@@ -344,13 +362,232 @@ void quantized_conv_nchw(
   }
   ScalarType dtype = out.scalar_type();
   switch (dtype) {
-    ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv2d_nchw);
+    case ScalarType::Char: {
+      // int8 output: use the DMA kernel when a layer config matches; otherwise
+      // (no match, oc above the scratch-array capacity, kernel error) go generic.
+      // NOTE(review): the lookup gets only padding[0]/dilation[0] and neither
+      // groups nor weight_zero_point -- confirm the table entries make that safe.
+      constexpr int kMaxOutChannels = 2048;  // capacity of the scratch arrays below
+      const conv_layer_config_t* config_const = get_layer_config_by_params(
+          c, h, w,              // ic, ih, iw
+          oc, wh, ww,           // oc, kh, kw
+          oh, ow,               // oh, ow
+          stride[0], stride[1], // sy, sx
+          padding[0], dilation[0]); // pad, dil
+
+      // Make a mutable local copy — the static const table may reside in
+      // read-only memory (.rodata), so writing through const_cast is undefined
+      // behavior and silently fails on Xtensa targets.
+      conv_layer_config_t config_local;
+      conv_layer_config_t* config = NULL;
+      float effective_scale = 0.0f;
+      if (config_const != NULL && oc <= kMaxOutChannels) {
+        config_local = *config_const;  // shallow copy all fields
+        config = &config_local;
+
+        // DMA path for all layers ≥ 4×4 spatial; generic C fallback for ≤ 2×2.
+        //
+        // XAI kernel pipeline: out = (acc >> accumShift) * outputScale >> outputShift
+        // The kernel saturates the shifted accumulator to int16 [-32768, 32767]
+        // after accumShift, so accumShift must be chosen to keep accumulators in range.
+        effective_scale = bias_scale / output_scale;
+      }
+
+      if (config != NULL) {
+        config->input_zero_point = static_cast<int>(in_zero_point);
+        // Disable in-kernel ReLU — ExecuTorch applies ReLU as a separate op.
+        config->relu_min = -128;
+        config->relu_max = 127;
+
+        // Bias correction: absorb input_zero_point and output_zero_point
+        // into the kernel bias to avoid the double-clamp problem.
+        // Also clamp to 24-bit range (ACC_INIT_BIAS takes lower 24 bits);
+        // any residual beyond 24-bit is applied as post-kernel correction.
+        const int32_t* bias_orig = bias.const_data_ptr<int32_t>();
+        const int8_t*  wt_data   = weight.const_data_ptr<int8_t>();
+        const int       wt_per_oc = weight.numel() / oc;
+
+        static const int32_t BIAS_24BIT_MAX =  8388607;   // (1 << 23) - 1
+        static const int32_t BIAS_24BIT_MIN = -8388608;   // -(1 << 23)
+
+        // output_zero_point expressed in accumulator domain
+        int64_t zp_acc_corr = 0;
+        if (output_zero_point != 0 && effective_scale > 0.0f) {
+          double zp_d = static_cast<double>(output_zero_point) / effective_scale;
+          zp_acc_corr = static_cast<int64_t>(zp_d >= 0.0 ? zp_d + 0.5 : zp_d - 0.5);
+        }
+
+        // Per-channel split bias: kernel_bias (24-bit safe) + post_correction
+        int32_t kernel_bias[kMaxOutChannels];
+        int32_t post_correction[kMaxOutChannels];
+        int64_t max_abs_kernel_bias = 0;
+        for (int o = 0; o < oc; o++) {
+          int32_t w_sum = 0;
+          const int8_t* wt_oc = wt_data + o * wt_per_oc;
+          for (int i = 0; i < wt_per_oc; i++) {
+            w_sum += wt_oc[i];
+          }
+          int64_t bias_corr_64 = static_cast<int64_t>(bias_orig[o])
+                               - static_cast<int64_t>(in_zero_point) * w_sum;
+          int64_t target_bias = bias_corr_64 + zp_acc_corr;
+          int32_t kb;
+          if (target_bias > BIAS_24BIT_MAX) {
+            kb = BIAS_24BIT_MAX;
+          } else if (target_bias < BIAS_24BIT_MIN) {
+            kb = BIAS_24BIT_MIN;
+          } else {
+            kb = static_cast<int32_t>(target_bias);
+          }
+          kernel_bias[o] = kb;
+
+          int64_t abs_kb = kb >= 0 ? kb : -static_cast<int64_t>(kb);
+          if (abs_kb > max_abs_kernel_bias) max_abs_kernel_bias = abs_kb;
+          int64_t bias_residual = target_bias - kb;
+          float resid_float = static_cast<float>(bias_residual) * effective_scale;
+          int32_t resid_int = static_cast<int32_t>(resid_float >= 0.0f
+                            ? resid_float + 0.5f : resid_float - 0.5f);
+          post_correction[o] = resid_int;
+        }
+
+        // accumShift: ensure (acc >> accSh) fits in int16 after PACK.
+        // Tight bound from actual weight L1 norms instead of worst-case 128*128*P.
+        // max_acc = |bias| + sum(|weight_i|) * 128 since inputs are int8 (magnitude ≤ 128).
+        // Compute max sum(|weights|) across all output channels
+        int64_t max_weight_l1 = 0;
+        for (int o = 0; o < oc; o++) {
+          const int8_t* wt_oc = wt_data + o * wt_per_oc;
+          int64_t w_l1 = 0;
+          for (int i = 0; i < wt_per_oc; i++) {
+            w_l1 += (wt_oc[i] >= 0) ? wt_oc[i] : -wt_oc[i];
+          }
+          if (w_l1 > max_weight_l1) max_weight_l1 = w_l1;
+        }
+
+        // Tight max accumulator bound: bias + L1(weights) * max_input_magnitude
+        float max_acc = static_cast<float>(max_abs_kernel_bias)
+                      + static_cast<float>(max_weight_l1) * 128.0f;
+        int accum_shift = 0;
+        while (max_acc / static_cast<float>(1LL << accum_shift) > 32767.0f
+               && accum_shift < 31) {
+          accum_shift++;
+        }
+        config->accum_shift = accum_shift;
+
+        // outputShift & outputScale: maximize precision within uint16 range.
+        // Candidates are held in int64_t: effective_scale * 2^total_shift can
+        // exceed INT32_MAX, and an out-of-range float-to-int32 cast is undefined.
+        int best_shift = 15;
+        int64_t total_shift = static_cast<int64_t>(accum_shift) + best_shift;
+        int64_t raw_scale = static_cast<int64_t>(
+            effective_scale * static_cast<double>(1LL << total_shift));
+        if (raw_scale > 65535) {
+          // Scale too large for uint16_t, reduce outputShift until it fits
+          while (best_shift > 0 && raw_scale > 65535) {
+            best_shift--;
+            total_shift = static_cast<int64_t>(accum_shift) + best_shift;
+            raw_scale = static_cast<int64_t>(
+                effective_scale * static_cast<double>(1LL << total_shift));
+          }
+        } else if (raw_scale < 16384 && best_shift < 31) {
+          // Scale too small, increase outputShift for better precision
+          while (best_shift < 31) {
+            int64_t trial_total = static_cast<int64_t>(accum_shift) + best_shift + 1;
+            if (trial_total > 62) break;  // avoid 1LL << overflow
+            int64_t trial = static_cast<int64_t>(
+                effective_scale * static_cast<double>(1LL << trial_total));
+            if (trial > 65535) break;
+            best_shift++;
+            raw_scale = trial;
+          }
+        }
+        if (raw_scale <= 0) raw_scale = 1;
+        if (raw_scale > 65535) raw_scale = 65535;
+        config->output_shift = best_shift;
+        config->output_scale = static_cast<int32_t>(raw_scale);
+
+        // CPU-computed kernel_bias resides only in cache;
+        // DMA bypasses cache and reads system memory, so writeback is needed.
+        xthal_dcache_region_writeback(
+            reinterpret_cast<int8_t*>(kernel_bias), oc * sizeof(int32_t));
+
+        // Writeback input and weight from cache to system memory before DMA reads
+        xthal_dcache_region_writeback(
+            const_cast<int8_t*>(input.const_data_ptr<int8_t>()),
+            n * c * h * w * sizeof(int8_t));
+        xthal_dcache_region_writeback(
+            const_cast<int8_t*>(weight.const_data_ptr<int8_t>()),
+            weight.numel() * sizeof(int8_t));
+
+        XAI_ERR_TYPE kern_status = conv_execute_kernel(
+            const_cast<int8_t*>(input.const_data_ptr<int8_t>()),
+            out.mutable_data_ptr<int8_t>(),
+            const_cast<int8_t*>(weight.const_data_ptr<int8_t>()),
+            reinterpret_cast<int8_t*>(kernel_bias),
+            config);
+
+        // Invalidate cache for DMA-written output so post-correction
+        // and next operator see fresh data instead of stale cache lines
+        xthal_dcache_region_invalidate(
+            out.mutable_data_ptr<int8_t>(),
+            n * oc * oh * ow * sizeof(int8_t));
+
+        if (kern_status == 0) {
+          // Apply post-kernel residual correction
+          bool has_correction = false;
+          for (int _n = 0; _n < n; _n++) {
+            for (int _oc = 0; _oc < oc; _oc++) {
+              int32_t corr = post_correction[_oc];
+              if (corr == 0) continue;
+              has_correction = true;
+              int8_t* ch_out = out.mutable_data_ptr<int8_t>() + (_n * oc * oh * ow + _oc * oh * ow);
+              for (int _s = 0; _s < oh * ow; _s++) {
+                int32_t val = static_cast<int32_t>(ch_out[_s]) + corr;
+                val = val < -128 ? -128 : (val > 127 ? 127 : val);
+                ch_out[_s] = static_cast<int8_t>(val);
+              }
+            }
+          }
+
+          // If post_correction modified output via CPU/cache, writeback to system memory
+          if (has_correction) {
+            xthal_dcache_region_writeback(
+                out.mutable_data_ptr<int8_t>(),
+                n * oc * oh * ow * sizeof(int8_t));
+          }
+          break;
+        }
+        // The kernel reported an error: do not trust its output, recompute below.
+      }
+      // Fall through to generic implementation
+      conv2d_nchw_core_generic<int8_t, int8_t, int32_t, int8_t, true>(
+          input.const_data_ptr<int8_t>(),
+          weight.const_data_ptr<int8_t>(),
+          bias.const_data_ptr<int32_t>(),
+          out.mutable_data_ptr<int8_t>(),
+          n, c, h, w,
+          oc, wc, wh, ww,
+          oh, ow,
+          stride[0], stride[1],
+          padding[0], padding[1],
+          dilation[0], dilation[1],
+          groups,
+          in_zero_point,
+          weight_zero_point,
+          bias_scale,
+          output_scale,
+          (int8_t)output_zero_point);
+      break;
+    }
+    // Handle uint8_t (Byte) case - previously covered by ET_FORALL_CADENCE_QUANTIZED_TYPES
+    // Note: Char (int8_t) is handled explicitly above with optimized kernel
+    typed_quantized_conv2d_nchw(uint8_t, Byte);
     default:
       ET_DCHECK_MSG(
           false, "Unhandled dtype %s", torch::executor::toString(dtype));
   }
 
 #undef typed_quantized_conv2d_nchw
+
 }
 
 void quantized_conv_nhwc(
@@ -582,7 +819,6 @@ void quantized_conv2d_nhwc_per_tensor_out(
     int64_t output_zero_point,
     int64_t out_multiplier,
     int64_t out_shift,
-    ET_UNUSED const ::executorch::aten::optional<Tensor>& offset,
     Tensor& out) {
   quantized_conv_per_tensor_out(
       ctx,
@@ -678,6 +914,158 @@ void quantized_conv2d_nhwc_out(
       out);
 }
 
+void quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out( // Thin forwarding wrapper around quantized_conv_nchw.
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    __ET_UNUSED int64_t out_multiplier, // not forwarded; the call below passes bias_scale/output_scale
+    __ET_UNUSED int64_t out_shift, // not forwarded
+    Tensor& out) {
+  quantized_conv_nchw( // every remaining argument is passed through unchanged
+      input, weight, bias, stride, padding, dilation, groups,
+      in_zero_point, weight_zero_point, bias_scale, output_scale,
+      output_zero_point, out);
+}
+
 } // namespace native
 } // namespace vision
+
+// The codegen dispatches to impl::generic::native:: namespace.
+// Forward to the vision::native implementation.
+namespace generic {
+namespace native {
+
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+
+void quantized_conv2d_nhwc_per_tensor_out( // generic::native entry point; forwards to the vision::native implementation
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    Tensor& out) {
+  ::impl::vision::native::quantized_conv_per_tensor_out( // all parameters are passed through unchanged
+      ctx, input, weight, bias, stride, padding, dilation, groups,
+      in_zero_point, weight_zero_point, bias_scale, output_scale,
+      output_zero_point, out_multiplier, out_shift,
+      true, // channel_last = true for NHWC
+      out);
+}
+
+void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( // Forwards to vision::native::quantized_conv_nchw.
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    __ET_UNUSED int64_t out_multiplier, // not forwarded; the call below passes bias_scale/output_scale
+    __ET_UNUSED int64_t out_shift, // not forwarded
+    Tensor& out) {
+  ::impl::vision::native::quantized_conv_nchw( // every remaining argument is passed through unchanged
+      input, weight, bias, stride, padding, dilation, groups,
+      in_zero_point, weight_zero_point, bias_scale, output_scale,
+      output_zero_point, out);
+}
+
+void quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( // Forwards to vision::native::quantized_conv_nchw.
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    __ET_UNUSED int64_t out_multiplier, // not forwarded; the call below passes bias_scale/output_scale
+    __ET_UNUSED int64_t out_shift, // not forwarded
+    Tensor& out) {
+  ::impl::vision::native::quantized_conv_nchw( // same call as the asym8s NCL wrapper
+      input, weight, bias, stride, padding, dilation, groups,
+      in_zero_point, weight_zero_point, bias_scale, output_scale,
+      output_zero_point, out);
+}
+
+void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    Tensor& out) {
+  // NLC is a channel-last layout: forward with channel_last = true (same call as the NHWC conv2d wrapper above) instead of the NCHW kernel.
+  ::impl::vision::native::quantized_conv_per_tensor_out(
+      ctx, input, weight, bias, stride, padding, dilation, groups, in_zero_point, weight_zero_point,
+      bias_scale, output_scale, output_zero_point, out_multiplier, out_shift, /*channel_last=*/true, out);
+}
+
+void quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    Tensor& out) {
+  // NLC is a channel-last layout: forward with channel_last = true (same call as the NHWC conv2d wrapper above) instead of the NCHW kernel.
+  ::impl::vision::native::quantized_conv_per_tensor_out(
+      ctx, input, weight, bias, stride, padding, dilation, groups, in_zero_point, weight_zero_point,
+      bias_scale, output_scale, output_zero_point, out_multiplier, out_shift, /*channel_last=*/true, out);
+}
+
+} // namespace native
+} // namespace generic
 } // namespace impl
diff --git a/backends/cadence/vision/operators/op_quantized_linear_out.cpp b/backends/cadence/vision/operators/op_quantized_linear_out.cpp
index b6b7cdd17bc..7b579fb8d0d 100644
--- a/backends/cadence/vision/operators/op_quantized_linear_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_linear_out.cpp
@@ -6,18 +6,65 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/backends/cadence/vision/operators/operators.h>
-#include <executorch/backends/cadence/vision/operators/quantized_ops.h>
+#include <api.h>
+#include <lib.h>
+#include <algorithm>
+#include <cmath>
+#include <executorch/backends/cadence/generic/kernels/kernels.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/platform/assert.h>
 
 namespace impl {
 namespace vision {
 namespace native {
 
+using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::getLeadingDims;
 using executorch::runtime::KernelRuntimeContext;
 
+// Scalar reference for per-tensor quantized linear: out = quantize(bias + sum((src - src_zp) * (weight - weight_zp))).
+// Used when the optimized int8 path does not apply; instantiated in this file for int8_t and uint8_t.
+template <typename T>
+void quantized_linear_per_tensor_generic_(
+    const Tensor& src,
+    const Tensor& weight,
+    const Tensor& bias,
+    const int64_t src_zero_point,
+    const int64_t weight_zero_point,
+    const int64_t out_multiplier,
+    const int64_t out_shift,
+    const int64_t out_zero_point,
+    Tensor& out) {
+  // weight is indexed as [out_dim, in_dim]; all leading dims of src are flattened into rows.
+  const int64_t leading_dims = getLeadingDims(src, src.dim() - 1);
+  const int64_t out_dim = weight.size(0);
+  const int64_t in_dim = weight.size(1);
+
+  const T* __restrict__ in_data = src.const_data_ptr<T>();
+  const T* __restrict__ weight_data = weight.const_data_ptr<T>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  T* __restrict__ out_data = out.mutable_data_ptr<T>();
+
+  // requant_scale = out_multiplier * 2^(out_shift - 31). Dividing by a positive 64-bit 2^31 gives the same
+  // value as the `-x / (1 << 31)` idiom without shifting into the sign bit of a 32-bit int.
+  const float requant_scale = out_multiplier * 1.0 / (1LL << 31) * pow(2, out_shift);
+  const int32_t src_zp = static_cast<int32_t>(src_zero_point);
+  const int32_t weight_zp = static_cast<int32_t>(weight_zero_point);
+
+  // Signed indices: the bounds are int64_t, so this avoids signed/unsigned comparisons.
+  for (int64_t i = 0; i < leading_dims; ++i) {
+    for (int64_t j = 0; j < out_dim; ++j) {
+      int32_t sum = bias_data[j];
+      for (int64_t k = 0; k < in_dim; ++k) {
+        sum += (static_cast<int32_t>(in_data[i * in_dim + k]) - src_zp) * (static_cast<int32_t>(weight_data[j * in_dim + k]) - weight_zp);
+      }
+      out_data[i * out_dim + j] = ::impl::generic::kernels::quantize<T>(sum, requant_scale, out_zero_point);
+    }
+  }
+}
+
+// Upstream-style quantized_linear_out with tensor-based zero points
 template <typename T>
 void inline _typed_quantized_linear(
     const Tensor& src,
@@ -36,15 +83,9 @@ void inline _typed_quantized_linear(
 
   int32_t weight_zero_point = weight_zero_point_t.const_data_ptr<int32_t>()[0];
 
-  // input comes in shape [batch_size, in_dim]
-  // weight comes in shape [out_dim, in_dim]
-  // output comes in empty with shape [batch_size, out_dim]
-  // Perform matrix multiply (M x N) x (N x P) => M x P
   const auto M = weight.size(0); // = out_dim
   const auto N = weight.size(1); // = in_dim
 
-  // Given an N-dimensional input [d0, d1, d2, ..., d_{N-2}, d_{N-1}], the
-  // leading dimensions is d0 * d1 * ... * d_{N-2}
   const auto leading_dims = getLeadingDims(src, src.dim() - 1);
 
   ET_CHECK_MSG(
@@ -69,7 +110,7 @@ void inline _typed_quantized_linear(
             (weight_data[j * N + k] - weight_zero_point);
       }
       out_data[i * M + j] =
-          kernels::quantize<T>(sum, out_scale, out_zero_point);
+          impl::generic::kernels::quantize<T>(sum, out_scale, out_zero_point);
     }
   }
 }
@@ -86,29 +127,14 @@ void quantized_linear_out(
     int64_t out_zero_point,
     __ET_UNUSED const executorch::aten::optional<Tensor>& offset,
     Tensor& out) {
-  // TODO: refactor to use switch case as quantized_linear_per_tensor_out
   if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
     _typed_quantized_linear<uint8_t>(
-        src,
-        weight,
-        bias,
-        src_zero_point,
-        weight_zero_point_t,
-        out_multiplier,
-        out_shift,
-        out_zero_point,
-        out);
+        src, weight, bias, src_zero_point, weight_zero_point_t,
+        out_multiplier, out_shift, out_zero_point, out);
   } else if (out.scalar_type() == executorch::aten::ScalarType::Char) {
     _typed_quantized_linear<int8_t>(
-        src,
-        weight,
-        bias,
-        src_zero_point,
-        weight_zero_point_t,
-        out_multiplier,
-        out_shift,
-        out_zero_point,
-        out);
+        src, weight, bias, src_zero_point, weight_zero_point_t,
+        out_multiplier, out_shift, out_zero_point, out);
   } else {
     ET_CHECK_MSG(
         false,
@@ -117,6 +143,7 @@ void quantized_linear_out(
   }
 }
 
+// Optimized quantized_linear_per_tensor_out with DMA and SIMD support
 void quantized_linear_per_tensor_out(
     __ET_UNUSED KernelRuntimeContext& ctx,
     const Tensor& src,
@@ -127,33 +154,168 @@ void quantized_linear_per_tensor_out(
     const int64_t out_multiplier,
     const int64_t out_shift,
     const int64_t out_zero_point,
-    __ET_UNUSED const executorch::aten::optional<Tensor>& offset,
+    __ET_UNUSED const std::optional<Tensor>& offset,
     Tensor& out) {
-#define typed_quantized_linear_per_tensor(ctype, dtype) \
-  case executorch::aten::ScalarType::dtype: {           \
-    quantized_linear_per_tensor_<ctype>(                \
-        src,                                            \
-        weight,                                         \
-        bias,                                           \
-        src_zero_point,                                 \
-        weight_zero_point,                              \
-        out_multiplier,                                 \
-        out_shift,                                      \
-        out_zero_point,                                 \
-        out);                                           \
-    break;                                              \
+  // Per-tensor quantized linear. int8 src/weight/out with in_dim >= 16 use rvdot_zeropt (staging data in the
+  // ptr_dram0/ptr_dram1 buffers via iDMA when possible); everything else uses quantized_linear_per_tensor_generic_.
+  const int64_t leading_dims = getLeadingDims(src, src.dim() - 1);
+  const int64_t out_dim = weight.size(0);
+  const int64_t in_dim = weight.size(1);
+  const size_t numel = leading_dims * out_dim;
+
+  bool use_optimized = false;
+  if (src.scalar_type() == ScalarType::Char &&
+      weight.scalar_type() == ScalarType::Char &&
+      out.scalar_type() == ScalarType::Char &&
+      in_dim >= 16) {
+    use_optimized = true;
   }
 
-  executorch::aten::ScalarType dtype = out.scalar_type();
-  switch (dtype) {
-    ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_linear_per_tensor);
-    default:
-      ET_DCHECK_MSG(
-          false, "Unhandled dtype %s", executorch::runtime::toString(dtype));
+  if (use_optimized) {
+    const int8_t* in_data = src.const_data_ptr<int8_t>();
+    const int8_t* weight_data = weight.const_data_ptr<int8_t>();
+    const int32_t* bias_data = bias.const_data_ptr<int32_t>();
+    int8_t* out_data = out.mutable_data_ptr<int8_t>();
+
+    const int32_t in_zp = static_cast<int32_t>(src_zero_point);
+    const int32_t weight_zp = static_cast<int32_t>(weight_zero_point);
+    const int32_t out_zp = static_cast<int32_t>(out_zero_point);
+
+    // Compute requant scale
+    const float requant_scale =
+        -out_multiplier * 1.0f / (1 << 31) * std::pow(2.0f, (float)out_shift);
+
+    // Check if DRAM buffers are available for DMA
+    bool dram0_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0);
+    bool dram1_available = (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0);
+
+    // DMA threshold: only beneficial for larger problems
+    const size_t DMA_THRESHOLD = 512;
+    // The input row is staged in DRAM0 and at least one weight row in DRAM1; both copies are in_dim bytes,
+    // so DMA is only safe when a row fits in each buffer (otherwise the iDMA copy would overrun it).
+    const bool row_fits = (in_dim <= (int64_t)IDMA_BUFFER_SIZE_DRAM0) && (in_dim <= (int64_t)IDMA_BUFFER_SIZE_DRAM1);
+    bool use_dma = (in_dim >= DMA_THRESHOLD) && dram0_available && dram1_available && row_fits;
+
+    if (use_dma && leading_dims == 1) {
+      // Single sample: DMA-optimized tiling (block prefetch) processing
+      size_t input_buffer_size = in_dim;
+      size_t max_tile_rows = IDMA_BUFFER_SIZE_DRAM1 / in_dim; // >= 1 because row_fits holds
+      size_t tile_rows = (max_tile_rows < out_dim) ? max_tile_rows : out_dim;
+
+      int8_t* input_cache = (int8_t*)ptr_dram0;
+      int8_t* weight_tile = (int8_t*)ptr_dram1;
+
+      xthal_dcache_region_writeback((void*)in_data, sizeof(int8_t) * src.numel());
+      xthal_dcache_region_writeback((void*)weight_data, sizeof(int8_t) * weight.numel());
+
+      dma_2dm_init(0);
+      int32_t idx_in = idma_copy_2d_desc(0, input_cache, (void*)in_data,
+                                        input_buffer_size, DESC_IDMA_PRIOR_H, 1, 0, 0);
+      idma_desc_done(0, idx_in);
+
+      for (size_t j_tile = 0; j_tile < out_dim; j_tile += tile_rows) {
+        size_t curr_tile = ((j_tile + tile_rows) <= out_dim) ? tile_rows : (out_dim - j_tile);
+        int32_t idx_weight = idma_copy_2d_desc(0, weight_tile, (void*)(weight_data + j_tile * in_dim),
+                                              curr_tile * in_dim, DESC_IDMA_PRIOR_H, 1, 0, 0);
+        idma_desc_done(0, idx_weight);
+
+        for (size_t j = 0; j < curr_tile; ++j) {
+          int32_t acc = bias_data[j_tile + j];
+          acc = rvdot_zeropt(
+              acc, input_cache, weight_tile + j * in_dim,
+              in_zp, weight_zp, (int)in_dim);
+          out_data[j_tile + j] = ::impl::generic::kernels::quantize<int8_t>(acc, requant_scale, out_zp);
+        }
+      }
+
+      xthal_dcache_region_writeback(out_data, sizeof(int8_t) * numel);
+
+      return;
+    }
+
+    // Fallback: No DMA or multi-sample - use direct SIMD
+    xthal_dcache_region_writeback((void*)in_data, sizeof(int8_t) * src.numel());
+    xthal_dcache_region_invalidate((void*)in_data, sizeof(int8_t) * src.numel());
+    xthal_dcache_region_writeback((void*)weight_data, sizeof(int8_t) * weight.numel());
+    xthal_dcache_region_invalidate((void*)weight_data, sizeof(int8_t) * weight.numel());
+    xthal_dcache_region_writeback((void*)bias_data, sizeof(int32_t) * bias.numel());
+    xthal_dcache_region_invalidate((void*)bias_data, sizeof(int32_t) * bias.numel());
+
+    for (size_t i = 0; i < leading_dims; ++i) {
+      const int8_t* in_row = &in_data[i * in_dim];
+      for (size_t j = 0; j < out_dim; ++j) {
+        const int8_t* weight_row = &weight_data[j * in_dim];
+        int32_t acc = bias_data[j];
+        acc = rvdot_zeropt(
+            acc, in_row, weight_row,
+            in_zp, weight_zp, (int)in_dim);
+        out_data[i * out_dim + j] =
+            ::impl::generic::kernels::quantize<int8_t>(acc, requant_scale, out_zp);
+      }
+    }
+
+    xthal_dcache_region_writeback(out_data, sizeof(int8_t) * numel);
+  } else {
+    // Fallback: use generic implementation
+    if (out.scalar_type() == ScalarType::Char) {
+      quantized_linear_per_tensor_generic_<int8_t>(
+          src, weight, bias,
+          src_zero_point, weight_zero_point,
+          out_multiplier, out_shift, out_zero_point, out);
+    } else if (out.scalar_type() == ScalarType::Byte) {
+      quantized_linear_per_tensor_generic_<uint8_t>(
+          src, weight, bias,
+          src_zero_point, weight_zero_point,
+          out_multiplier, out_shift, out_zero_point, out);
+    } else {
+      ET_CHECK_MSG(
+          false,
+          "Unhandled output dtype %hhd",
+          static_cast<int8_t>(out.scalar_type()));
+    }
+
   }
-#undef typed_quantized_linear_per_tensor
 }
 
-}; // namespace native
-}; // namespace vision
-}; // namespace impl
+// Wrapper functions for different quantization schemes
+void quantized_linear_asym8sxasym8s_asym8s_per_tensor_out( // Forwards unchanged to quantized_linear_per_tensor_out.
+    KernelRuntimeContext& ctx,
+    const Tensor& src,
+    const Tensor& weight,
+    const Tensor& bias,
+    const int64_t src_zero_point,
+    const int64_t weight_zero_point,
+    const int64_t out_multiplier,
+    const int64_t out_shift,
+    const int64_t out_zero_point,
+    const std::optional<Tensor>& offset,
+    Tensor& out) {
+  quantized_linear_per_tensor_out( // dtype dispatch (optimized int8 path vs. generic fallback) happens in the callee
+      ctx, src, weight, bias,
+      src_zero_point, weight_zero_point,
+      out_multiplier, out_shift, out_zero_point,
+      offset, out);
+}
+
+void quantized_linear_asym8uxasym8u_asym8u_per_tensor_out( // Forwards unchanged to quantized_linear_per_tensor_out.
+    KernelRuntimeContext& ctx,
+    const Tensor& src,
+    const Tensor& weight,
+    const Tensor& bias,
+    const int64_t src_zero_point,
+    const int64_t weight_zero_point,
+    const int64_t out_multiplier,
+    const int64_t out_shift,
+    const int64_t out_zero_point,
+    const std::optional<Tensor>& offset,
+    Tensor& out) {
+  quantized_linear_per_tensor_out( // a Byte output tensor is handled there by the generic uint8_t fallback
+      ctx, src, weight, bias,
+      src_zero_point, weight_zero_point,
+      out_multiplier, out_shift, out_zero_point,
+      offset, out);
+}
+
+} // namespace native
+} // namespace vision
+} // namespace impl
diff --git a/backends/cadence/vision/operators/op_quantized_relu_out.cpp b/backends/cadence/vision/operators/op_quantized_relu_out.cpp
index 45b9e09b1dd..812c33873ed 100644
--- a/backends/cadence/vision/operators/op_quantized_relu_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_relu_out.cpp
@@ -6,109 +6,348 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/backends/cadence/vision/kernels/kernels.h>
-#include <executorch/backends/cadence/vision/operators/operators.h>
+#include <lib.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <executorch/backends/cadence/generic/kernels/kernels.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/platform/assert.h>
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::KernelRuntimeContext;
+
+// Forward declaration of the Vision SIMD ReLU kernel (C linkage). NOTE(review): the code visible in this file calls vrelU_quantized, not vrelU - confirm this declaration is still needed.
+extern "C" void vrelU(
+    uint8_t* y,
+    const int8_t* x,
+    const uint8_t minVal,
+    uint8_t maxVal,
+    int N);
+
+#define ET_FORALL_CADENCE_QUANTIZED_TYPES(_) \
+  _(uint8_t, Byte)                           \
+  _(int8_t, Char)
 
 namespace impl {
 namespace vision {
 namespace native {
 
-using executorch::aten::Tensor;
-using executorch::runtime::KernelRuntimeContext;
-
+// Generic scalar fallback (from generic/operators/quantized_relu_out.cpp): out[i] = quantize(max(in[i] - in_zero_point, 0), out_scale, out_zero_point)
 template <typename T>
-void quantized_relu_(
+void quantized_relu_per_tensor_out_(
+    __ET_UNUSED KernelRuntimeContext& ctx,
     const Tensor& input,
-    const Tensor& in_zero_point,
-    const int64_t out_zero_point,
-    const Tensor& out_multiplier,
-    const Tensor& out_shift,
+    int64_t in_zero_point,
+    int64_t out_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
     Tensor& output) {
-  T q_zero_point = in_zero_point.const_data_ptr<T>()[0];
   const T* __restrict__ in = input.const_data_ptr<T>();
   T* __restrict__ out = output.mutable_data_ptr<T>();
 
-  const int32_t* __restrict__ out_multiplier_data =
-      out_multiplier.const_data_ptr<int32_t>();
-  const int32_t* __restrict__ out_shift_data =
-      out_shift.const_data_ptr<int32_t>();
-
   // Compute the out_scale from out_multiplier and out_shift
-  const float out_scale =
-      -out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]);
+  const float out_scale = -out_multiplier * 1.0 / (1 << 31) * pow(2, out_shift); // assumes 32-bit int, where (1 << 31) is negative: the leading minus makes this out_multiplier * 2^(out_shift - 31)
 
   for (size_t i = 0, e = input.numel(); i < e; ++i) {
-    const T temp = in[i] > q_zero_point ? (in[i] - q_zero_point) : 0;
-    out[i] = kernels::quantize<T>(temp, out_scale, out_zero_point);
+    const float temp = in[i] > in_zero_point ? (in[i] - in_zero_point) : 0; // ReLU relative to the input zero point
+    out[i] = generic::kernels::quantize<T>(temp, out_scale, out_zero_point);
   }
 }
 
-void quantized_relu_out(
+
+void quantized_relu_per_tensor_out(
     KernelRuntimeContext& ctx,
     const Tensor& input,
-    const Tensor& in_zero_point,
+    const int64_t in_zero_point,
     const int64_t out_zero_point,
-    const Tensor& out_multiplier,
-    const Tensor& out_shift,
+    const int64_t out_multiplier,
+    const int64_t out_shift,
     Tensor& output) {
-  if (input.scalar_type() == executorch::aten::ScalarType::Byte) {
-    quantized_relu_<uint8_t>(
-        input,
-        in_zero_point,
-        out_zero_point,
-        out_multiplier,
-        out_shift,
-        output);
-  } else if (input.scalar_type() == executorch::aten::ScalarType::Char) {
-    quantized_relu_<int8_t>(
-        input,
-        in_zero_point,
-        out_zero_point,
-        out_multiplier,
-        out_shift,
-        output);
+  // Per-tensor quantized ReLU. 8-bit tensors with at least 16 elements go through vrelU_quantized (staged
+  // through the ptr_dram0/ptr_dram1 buffers via iDMA when large enough); the rest use the scalar fallback.
+  size_t numel = input.numel();
+
+  // Check if we can use Vision SIMD path for quantized data
+  // vrelU supports int8/uint8 input and output (with appropriate casting)
+  bool use_optimized = (input.scalar_type() == ScalarType::Char || input.scalar_type() == ScalarType::Byte) &&
+                       (output.scalar_type() == ScalarType::Char || output.scalar_type() == ScalarType::Byte) &&
+                       (numel >= 16);
+
+  if (use_optimized) {
+    // Vision-optimized SIMD path using vrelU_quantized with iDMA support
+    // The kernel is called through int8_t* pointers; out_data keeps a uint8_t* view for the DMA stores
+    const int8_t* in_data;
+    if (input.scalar_type() == ScalarType::Char) {
+      in_data = input.const_data_ptr<int8_t>();
+    } else {
+      in_data = reinterpret_cast<const int8_t*>(input.const_data_ptr<uint8_t>());
+    }
+
+    uint8_t* out_data;
+    if (output.scalar_type() == ScalarType::Byte) {
+      out_data = output.mutable_data_ptr<uint8_t>();
+    } else {
+      out_data = reinterpret_cast<uint8_t*>(output.mutable_data_ptr<int8_t>());
+    }
+
+    // For quantized operations and dumps, we need int8_t view of output
+    int8_t* out_data_int8 = reinterpret_cast<int8_t*>(out_data);
+
+    // NOTE(review): Byte tensors are only reinterpreted as int8_t here and then handed to the same
+    // vrelU_quantized call as Char tensors - confirm the kernel really handles uint8 data, otherwise
+    // restrict this path to Char and let Byte tensors use the generic fallback below.
+
+    // Common quantization parameters (used by both DMA and non-DMA paths)
+    const float out_scale = -out_multiplier * 1.0f / (1 << 31) * std::pow(2.0f, (float)out_shift);
+    const int32_t in_zp = static_cast<int32_t>(in_zero_point);
+    const int32_t out_zp = static_cast<int32_t>(out_zero_point);
+
+    // DMA setup
+    bool ping_pong_process = false;
+    bool ping_process_pong = false;
+    size_t chunk_size = 0;
+
+    int8_t* inp_buff[2];
+    uint8_t* out_buff[2];
+
+    // Check if DRAM buffers are available
+    bool dram0_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0);
+    bool dram1_available = (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0);
+
+    // DMA has overhead - only beneficial for larger tensors
+    // Threshold: 1024 elements (~1KB for int8 input/output)
+    const size_t DMA_THRESHOLD = 1024;
+    bool use_dma = (numel >= DMA_THRESHOLD);
+
+    // Strategy 1: Try ping-pong processing (2 input + 2 output buffers)
+    // Using 50/50 split: both int8/uint8 are 1 byte each
+    if (use_dma && dram0_available && dram1_available && (numel >= 2)) {
+      // Chunks alternate between the DRAM0 and DRAM1 halves, so a chunk must fit the smaller half;
+      // sizing it from DRAM0 alone would overrun the DRAM1 buffers whenever DRAM1 is the smaller one.
+      const size_t half_dram0 = IDMA_BUFFER_SIZE_DRAM0 / 2;
+      const size_t half_dram1 = IDMA_BUFFER_SIZE_DRAM1 / 2;
+      const size_t per_buffer = (half_dram0 < half_dram1) ? half_dram0 : half_dram1;
+      if (per_buffer > 0) {
+
+        // Allocate buffers with 50/50 split
+        inp_buff[0] = (int8_t*)ptr_dram0;
+        out_buff[0] = (uint8_t*)((uint8_t*)ptr_dram0 + (IDMA_BUFFER_SIZE_DRAM0 / 2));
+
+        inp_buff[1] = (int8_t*)ptr_dram1;
+        out_buff[1] = (uint8_t*)((uint8_t*)ptr_dram1 + (IDMA_BUFFER_SIZE_DRAM1 / 2));
+
+        chunk_size = per_buffer;
+        ping_pong_process = true;
+      }
+    }
+
+    // Strategy 2: Fallback to ping-process-pong (1 input + 1 output buffer)
+    // Use full DRAM0 for input, full DRAM1 for output (no split needed)
+    if (use_dma && !ping_pong_process && dram0_available && dram1_available) {
+      size_t inp_capacity = IDMA_BUFFER_SIZE_DRAM0;  // Full DRAM0 for int8 input (in bytes)
+      size_t out_capacity = IDMA_BUFFER_SIZE_DRAM1;  // Full DRAM1 for uint8 output (in bytes)
+
+      if ((inp_capacity > 0) && (out_capacity >= inp_capacity)) {
+        inp_buff[0] = (int8_t*)ptr_dram0;
+        out_buff[0] = (uint8_t*)ptr_dram1;
+
+        chunk_size = (inp_capacity < out_capacity) ? inp_capacity : out_capacity;
+        ping_process_pong = true;
+      }
+    }
+
+    if (ping_pong_process || ping_process_pong) {
+      const int8_t* ptr_inp = in_data;
+
+      // Writeback input from cache to system memory before DMA reads
+      xthal_dcache_region_writeback((void*)in_data, sizeof(int8_t) * numel);
+
+      /* Initialize DMA Channel 0 (loads) and Channel 1 (stores) */
+      dma_2dm_init(0);
+      dma_2dm_init(1);
+
+      if (ping_pong_process) {
+        // Ping-pong processing for better throughput
+        size_t num_chunks = (numel + chunk_size - 1) / chunk_size;
+
+        if (num_chunks == 0) num_chunks = 1;
+
+        int32_t pp_swap = 0;
+
+        int8_t* ptr_in = (int8_t*)ptr_inp;
+        uint8_t* ptr_out = out_data;
+
+        // Load first chunk via ch0
+        size_t current_chunk = (numel < chunk_size) ? numel : chunk_size;
+
+        dma_1dm(0, ptr_in, inp_buff[pp_swap], sizeof(int8_t) * current_chunk);
+
+        size_t remaining = numel - current_chunk;
+        ptr_in += current_chunk;
+
+        // Pipeline: load (ch0) and store (ch1) overlap with processing
+        for (size_t i = 0; i < (num_chunks - 1); i++) {
+          size_t next_chunk = (remaining < chunk_size) ? remaining : chunk_size;
+
+          // Wait for current load to complete
+          idma_hw_wait_all(0);
+
+          // Start loading next chunk into alternate buffer via ch0
+          dma_1dm(0, ptr_in, inp_buff[pp_swap ^ 1], sizeof(int8_t) * next_chunk);
+
+          // Process current chunk (ch0 loads next in parallel)
+          int8_t* out_chunk_int8 = reinterpret_cast<int8_t*>(out_buff[pp_swap]);
+          vrelU_quantized(out_chunk_int8, inp_buff[pp_swap], in_zp, out_zp, out_scale, (int)current_chunk);
+
+          // Wait for previous store to complete before reusing out_buff
+          idma_hw_wait_all(1);
+
+          // Store result via ch1
+          dma_1dm(1, out_buff[pp_swap], ptr_out, sizeof(uint8_t) * current_chunk);
+
+          ptr_in += next_chunk;
+          ptr_out += current_chunk;
+          remaining -= next_chunk;
+          current_chunk = next_chunk;
+          pp_swap ^= 1;
+        }
+
+        // Process last chunk
+        idma_hw_wait_all(0);
+        int8_t* out_last_int8 = reinterpret_cast<int8_t*>(out_buff[pp_swap]);
+        vrelU_quantized(out_last_int8, inp_buff[pp_swap], in_zp, out_zp, out_scale, (int)current_chunk);
+
+        idma_hw_wait_all(1);
+        dma_1dm(1, out_buff[pp_swap], ptr_out, sizeof(uint8_t) * current_chunk);
+        idma_hw_wait_all(1);
+
+        // Invalidate output cache: DMA wrote to system memory, cache may have stale data
+        xthal_dcache_region_invalidate(out_data, sizeof(uint8_t) * numel);
+
+      }
+      else if (ping_process_pong) {
+        // Simple sequential processing
+        size_t remaining = numel;
+        int8_t* ptr_in = (int8_t*)ptr_inp;
+        uint8_t* ptr_out = out_data;
+
+        while (remaining > 0) {
+          size_t current_chunk = (remaining < chunk_size) ? remaining : chunk_size;
+
+          // Start load via ch0 (overlaps with any pending ch1 store)
+          dma_1dm(0, ptr_in, inp_buff[0], sizeof(int8_t) * current_chunk);
+          // Wait for previous store to complete (out_buff[0] safe to write)
+          idma_hw_wait_all(1);
+          // Wait for load to complete
+          idma_hw_wait_all(0);
+
+          // Process
+          int8_t* out_chunk_int8 = reinterpret_cast<int8_t*>(out_buff[0]);
+          vrelU_quantized(out_chunk_int8, inp_buff[0], in_zp, out_zp, out_scale, (int)current_chunk);
+
+          // Store via ch1
+          dma_1dm(1, out_buff[0], ptr_out, sizeof(uint8_t) * current_chunk);
+
+          ptr_in += current_chunk;
+          ptr_out += current_chunk;
+          remaining -= current_chunk;
+        }
+        idma_hw_wait_all(1);
+
+        // Invalidate output cache: DMA wrote to system memory, cache may have stale data
+        xthal_dcache_region_invalidate(out_data, sizeof(uint8_t) * numel);
+
+      }
+    } else {
+      // Fallback: use SIMD function directly without DMA
+      // Writeback+invalidate input: ensures CPU-dirty data reaches system memory,
+      // then invalidate forces re-read from system memory (fresh data)
+      xthal_dcache_region_writeback((void*)in_data, sizeof(int8_t) * numel);
+      xthal_dcache_region_invalidate((void*)in_data, sizeof(int8_t) * numel);
+      // Use common parameters already computed above
+
+      vrelU_quantized(
+          out_data_int8,
+          in_data,
+          in_zp,
+          out_zp,
+          out_scale,
+          (int)numel);
+
+      // Writeback output from cache to system memory for DMA coherency
+      xthal_dcache_region_writeback(out_data, sizeof(uint8_t) * numel);
+
+    }
+
   } else {
-    ET_CHECK_MSG(
-        false,
-        "Unhandled input dtype %hhd",
-        static_cast<int8_t>(input.scalar_type()));
+    // Fallback: use generic implementation with template dispatching
+
+#define typed_quantized_relu(ctype, dtype)    \
+  case executorch::aten::ScalarType::dtype: { \
+    quantized_relu_per_tensor_out_<ctype>(    \
+        ctx,                                  \
+        input,                                \
+        in_zero_point,                        \
+        out_zero_point,                       \
+        out_multiplier,                       \
+        out_shift,                            \
+        output);                              \
+    break;                                    \
+  }
+
+    executorch::aten::ScalarType dtype = input.scalar_type();
+    switch (dtype) {
+      ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_relu)
+      default:
+        ET_DCHECK_MSG(
+            false, "Unhandled dtype %s", torch::executor::toString(dtype));
+    }
+
+#undef typed_quantized_relu
+
   }
 }
 
+// Tensor-parameter variant backing quantized_relu_out: in_zero_point, out_multiplier and out_shift arrive as tensors, but only element [0] of each is read.
 template <typename T>
-void quantized_relu_per_tensor_out_(
-    __ET_UNUSED KernelRuntimeContext& ctx,
+void quantized_relu_(
     const Tensor& input,
-    const int64_t in_zero_point,
+    const Tensor& in_zero_point,
     const int64_t out_zero_point,
-    const int64_t out_multiplier,
-    const int64_t out_shift,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
     Tensor& output) {
+  T q_zero_point = in_zero_point.const_data_ptr<T>()[0];
   const T* __restrict__ in = input.const_data_ptr<T>();
   T* __restrict__ out = output.mutable_data_ptr<T>();
 
-  // Compute the out_scale from out_multiplier and out_shift
-  const float out_scale = -out_multiplier * 1.0 / (1 << 31) * pow(2, out_shift);
+  const int32_t* __restrict__ out_multiplier_data =
+      out_multiplier.const_data_ptr<int32_t>();
+  const int32_t* __restrict__ out_shift_data =
+      out_shift.const_data_ptr<int32_t>();
+  // Compute the out_scale from the first element of out_multiplier and out_shift
+  const float out_scale =
+      -out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]);
 
   for (size_t i = 0, e = input.numel(); i < e; ++i) {
-    const float temp = in[i] > in_zero_point ? (in[i] - in_zero_point) : 0;
-    out[i] = kernels::quantize<T>(temp, out_scale, out_zero_point);
+    const float temp = in[i] > q_zero_point ? (in[i] - q_zero_point) : 0; // float, not T: in - zp can exceed T's range (e.g. 127 - (-128) for int8_t)
+    out[i] = generic::kernels::quantize<T>(temp, out_scale, out_zero_point);
   }
 }
 
-void quantized_relu_per_tensor_out(
+Tensor& quantized_relu_out(
     KernelRuntimeContext& ctx,
     const Tensor& input,
-    const int64_t in_zero_point,
+    const Tensor& in_zero_point,
     const int64_t out_zero_point,
-    const int64_t out_multiplier,
-    const int64_t out_shift,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
     Tensor& output) {
-#define typed_quantized_relu(ctype, dtype)    \
+#define typed_quantized_relu_ch(ctype, dtype) \
   case executorch::aten::ScalarType::dtype: { \
-    quantized_relu_per_tensor_out_<ctype>(    \
-        ctx,                                  \
+    quantized_relu_<ctype>(                   \
         input,                                \
         in_zero_point,                        \
         out_zero_point,                       \
@@ -120,15 +359,16 @@ void quantized_relu_per_tensor_out(
 
   executorch::aten::ScalarType dtype = input.scalar_type();
   switch (dtype) {
-    ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_relu)
+    ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_relu_ch)
     default:
       ET_DCHECK_MSG(
           false, "Unhandled dtype %s", torch::executor::toString(dtype));
   }
 
-#undef typed_quantized_relu
+#undef typed_quantized_relu_ch
+  return output;
 }
 
-}; // namespace native
-}; // namespace vision
-}; // namespace impl
+} // namespace native
+} // namespace vision
+} // namespace impl
diff --git a/backends/cadence/vision/operators/op_softmax.cpp b/backends/cadence/vision/operators/op_softmax.cpp
index 58ca33c6a0b..3faf2bcd307 100644
--- a/backends/cadence/vision/operators/op_softmax.cpp
+++ b/backends/cadence/vision/operators/op_softmax.cpp
@@ -6,14 +6,11 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/backends/cadence/vision/kernels/kernels.h>
+#include <lib.h>
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include <include/api.h>
-#include <include_private/idma_init.h>
-#include <stdio.h>
 
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
@@ -30,6 +27,8 @@ Tensor& _softmax_out(
     int64_t dim,
     bool half_to_float,
     Tensor& out) {
+  
+
   (void)ctx;
 
   ET_KERNEL_CHECK(
@@ -42,9 +41,9 @@ Tensor& _softmax_out(
       ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out);
 
   ET_KERNEL_CHECK(
-      ctx,
-      executorch::runtime::tensors_have_same_dim_order(in, out),
-      InvalidArgument,
+      ctx, 
+      executorch::runtime::tensors_have_same_dim_order(in, out), 
+      InvalidArgument, 
       out);
 
   // Adjust for negative dim
@@ -65,11 +64,73 @@ Tensor& _softmax_out(
   bool ping_pong_process = false;
   bool ping_process_pong = false;
 
-  if ((d == in.dim() - 1)) {
-    if (size <= IDMA_BUFF_SIZE / 4 && in.dim() != 1) {
-      ping_pong_process = true;
-    } else if (size <= IDMA_BUFF_SIZE / 2) {
-      ping_process_pong = true;
+  float32_t *inp_buff[2];
+  float32_t *out_buff[2];
+
+  if ((d == in.dim() - 1)){
+    if ((4 * FLT32_SIZE * size <= (IDMA_BUFFER_SIZE_DRAM0 + IDMA_BUFFER_SIZE_DRAM1)) && (in.dim() != 1)){
+      // For ping-pong processing we need to have enough buffer to hold 2 input and 2 output blocks
+      if (2 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM0 &&  2 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM1){
+        // DRAM0 and DRAM1 can each hold 1 input and 1 output block (2 blocks per bank)
+        inp_buff[0] = (float32_t *)ptr_dram0;
+        inp_buff[1] = (float32_t *)ptr_dram1;
+        out_buff[0] = (float32_t *)(ptr_dram0) + size;
+        out_buff[1] = (float32_t *)(ptr_dram1) + size;
+        ping_pong_process = true;
+      }
+      else if (4 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM0){
+        // DRAM0 can hold 2 input and 2 output blocks
+        inp_buff[0] = (float32_t *)ptr_dram0;
+        inp_buff[1] = (float32_t *)(ptr_dram0) + size; 
+        out_buff[0] = (float32_t *)(ptr_dram0) + 2 * size;
+        out_buff[1] = (float32_t *)(ptr_dram0) + 3 * size;
+        ping_pong_process = true;
+      }
+      else if (4 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM1){
+        // DRAM1 can hold 2 input and 2 output blocks
+        inp_buff[0] = (float32_t *)ptr_dram1;
+        inp_buff[1] = (float32_t *)(ptr_dram1) + size; 
+        out_buff[0] = (float32_t *)(ptr_dram1) + 2 * size;
+        out_buff[1] = (float32_t *)(ptr_dram1) + 3 * size;
+        ping_pong_process = true;
+      }
+      else if (3 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM0 && FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM1){
+        // DRAM0 can hold 2 output and 1 input blocks, DRAM1 can hold 1 input block
+        inp_buff[0] = (float32_t *)ptr_dram0;
+        inp_buff[1] = (float32_t *)ptr_dram1;
+        out_buff[0] = (float32_t *)(ptr_dram0) + size;
+        out_buff[1] = (float32_t *)(ptr_dram0) + 2 * size;
+        ping_pong_process = true;
+      }
+      else if (FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM0 && 3 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM1){
+        // DRAM1 can hold 2 output and 1 input blocks, DRAM0 can hold 1 input block
+        inp_buff[0] = (float32_t *)ptr_dram0;
+        inp_buff[1] = (float32_t *)ptr_dram1;
+        out_buff[0] = (float32_t *)(ptr_dram1) + size;
+        out_buff[1] = (float32_t *)(ptr_dram1) + 2 * size;
+        ping_pong_process = true;
+      }
+    }
+    else if (2 * FLT32_SIZE * size <= (IDMA_BUFFER_SIZE_DRAM0 + IDMA_BUFFER_SIZE_DRAM1)){
+      // For ping-process-pong we need to have enough buffer to hold 1 input and 1 output block
+      if (FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM0 && FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM1){
+        // Each bank can hold one block: input goes in DRAM0, output in DRAM1
+        inp_buff[0] = (float32_t *)ptr_dram0;
+        out_buff[0] = (float32_t *)ptr_dram1;
+        ping_process_pong = true;
+      }
+      else if (2 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM0){
+        // DRAM0 can hold 1 input and 1 output block
+        inp_buff[0] = (float32_t *)ptr_dram0;
+        out_buff[0] = (float32_t *)(ptr_dram0) + size;
+        ping_process_pong = true;
+      }
+      else if (2 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM1){
+        // DRAM1 can hold 1 input and 1 output block
+        inp_buff[0] = (float32_t *)ptr_dram1;
+        out_buff[0] = (float32_t *)(ptr_dram1) + size;
+        ping_process_pong = true;
+      }
     }
   }
 
@@ -79,20 +140,16 @@ Tensor& _softmax_out(
   if (in.dim() > MaxDim)
     optimized = false;
 
-  if (optimized) {
-    const float* ptr_inp = (float*)in.const_data_ptr<float>();
-    float* out_data = (float*)out.mutable_data_ptr<float>();
-
-    /* Channel 0*/
-    idma_init(0, 0, MAX_BLOCK_16, 8, TICK_CYCLES_1, 0, NULL);
-    idma_init_loop(0, descbuf[0], IDMA_2D_DESC, 1, NULL, NULL);
+  if (optimized){
+    const float32_t *ptr_inp = (float32_t *)in.const_data_ptr<float>();
+    float32_t *out_data = (float32_t *)out.mutable_data_ptr<float>();
 
-    /* Channel 1*/
-    idma_init(1, 0, MAX_BLOCK_16, 8, TICK_CYCLES_1, 0, NULL);
-    idma_init_loop(1, descbuf[1], IDMA_2D_DESC, 1, NULL, NULL);
+    /* Initialize DMA Channel 0 (loads) and Channel 1 (stores) */
+    dma_2dm_init(0);
+    dma_2dm_init(1);
 
     if (ping_pong_process) {
-      for (int i = 0; i < in.dim(); i++) {
+      for (int i = 0; i < in.dim(); i++){
         if (i != d)
           outer_size *= in.size(i);
       }
@@ -100,60 +157,47 @@ Tensor& _softmax_out(
       outer_stride = size;
       stride = size;
 
-      int pp_swap = 0;
+      int32_t pp_swap = 0;
 
-      float32_t* ptr_out = out_data;
-      float32_t* ptr_in = (float32_t*)ptr_inp;
+	    float32_t *ptr_out = out_data;
+	    float32_t *ptr_in = (float32_t *) ptr_inp;
 
-      idma_copy_2d_desc(
-          0, inpData[pp_swap], ptr_in, 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0);
-      pp_swap = 1;
+      // Load first chunk via ch0
+      dma_1dm(0, ptr_in, inp_buff[pp_swap], 4 * stride);
 
-      for (int i = 0; i < (outer_size - 1); i++) {
-        IDMA_HW_WAIT_ALL(0);
-        ptr_in += outer_stride;
-        idma_copy_2d_desc(
-            0,
-            inpData[pp_swap],
-            ptr_in,
-            4 * stride,
-            DESC_IDMA_PRIOR_H,
-            1,
-            0,
-            0);
-        pp_swap = pp_swap ^ 1;
-
-        /* PROCESS CALL */
-        vsoftmaxf(outData[pp_swap], inpData[pp_swap], stride);
-
-        IDMA_HW_WAIT_ALL(1);
-        idma_copy_2d_desc(
-            1,
-            ptr_out,
-            outData[pp_swap],
-            4 * stride,
-            DESC_IDMA_PRIOR_H,
-            1,
-            0,
-            0);
-        ptr_out += outer_stride;
-      }
+      for (int i = 0; i < (outer_size - 1); i++){
+          // Wait for current load to complete
+          idma_hw_wait_all(0);
+
+          ptr_in += outer_stride;
+          // Start loading next chunk into alternate buffer via ch0
+          dma_1dm(0, ptr_in, inp_buff[pp_swap ^ 1], 4 * stride);
+
+          /* PROCESS CALL */
+          vsoftmaxf(out_buff[pp_swap], inp_buff[pp_swap], stride);
 
-      IDMA_HW_WAIT_ALL(0);
-      pp_swap = pp_swap ^ 1;
+          // Wait for previous store to complete before reusing out_buff
+          idma_hw_wait_all(1);
+
+          // Store result via ch1
+          dma_1dm(1, out_buff[pp_swap], ptr_out, 4 * stride);
+          ptr_out += outer_stride;
+
+          pp_swap ^= 1;
+        }
 
-      /* PROCESS CALL */
-      vsoftmaxf(outData[pp_swap], inpData[pp_swap], stride);
+      // Process last chunk
+      idma_hw_wait_all(0);
+      vsoftmaxf(out_buff[pp_swap], inp_buff[pp_swap], stride);
 
-      IDMA_HW_WAIT_ALL(1);
-      idma_copy_2d_desc(
-          1, ptr_out, outData[pp_swap], 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0);
+      idma_hw_wait_all(1);
+      dma_1dm(1, out_buff[pp_swap], ptr_out, 4 * stride);
+      idma_hw_wait_all(1);
 
-      IDMA_HW_WAIT_ALL(1);
 
       return out;
     } else if (ping_process_pong) {
-      for (int i = 0; i < in.dim(); i++) {
+      for (int i = 0; i < in.dim(); i++){
         if (i != d)
           outer_size *= in.size(i);
       }
@@ -161,23 +205,27 @@ Tensor& _softmax_out(
       outer_stride = size;
       stride = size;
 
-      float32_t* ptr_out = out_data;
-      float32_t* ptr_in = (float32_t*)ptr_inp;
+	    float32_t *ptr_out = out_data;
+	    float32_t *ptr_in = (float32_t *) ptr_inp;
 
-      for (int i = 0; i < outer_size; i++) {
-        idma_copy_2d_desc(
-            0, data_dram0, ptr_in, 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0);
-        IDMA_HW_WAIT_ALL(0);
+	    for (int i = 0; i < outer_size; i++){
+        // Start load via ch0 (overlaps with any pending ch1 store)
+        dma_1dm(0, ptr_in, inp_buff[0], 4 * stride);
+        // Wait for previous store to complete
+        idma_hw_wait_all(1);
+        // Wait for load to complete
+        idma_hw_wait_all(0);
 
-        vsoftmaxf(data_dram1, data_dram0, stride);
+		    vsoftmaxf(out_buff[0], inp_buff[0], stride);
 
-        idma_copy_2d_desc(
-            1, ptr_out, data_dram1, 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0);
-        IDMA_HW_WAIT_ALL(1);
+        // Store via ch1
+        dma_1dm(1, out_buff[0], ptr_out, 4 * stride);
 
         ptr_in += outer_stride;
-        ptr_out += outer_stride;
-      }
+		    ptr_out += outer_stride;
+	    }
+      idma_hw_wait_all(1);
+
 
       return out;
     } else {
@@ -207,45 +255,51 @@ Tensor& _softmax_out(
 
       outer_stride = size;
 
-      float* ptr_out = (float*)kernels::allocate_temp_memory(
-          ctx, out.numel() * sizeof(float));
+      executorch::runtime::Result<void*> temp_mem_res = ctx.allocate_temp(out.numel() * sizeof(float));
+      float* ptr_out =
+          (float*)(temp_mem_res.ok() ? temp_mem_res.get() : nullptr);
 
       ET_KERNEL_CHECK(ctx, ptr_out != nullptr, MemoryAllocationFailed, out);
 
-      float* ptr_out1 = (float*)kernels::allocate_temp_memory(
-          ctx, out.numel() * sizeof(float));
+      executorch::runtime::Result<void*> temp_mem_res1 = ctx.allocate_temp(out.numel() * sizeof(float));
+      float* ptr_out1 =
+          (float*)(temp_mem_res1.ok() ? temp_mem_res1.get() : nullptr);
 
       ET_KERNEL_CHECK(ctx, ptr_out1 != nullptr, MemoryAllocationFailed, out);
 
       tensor_transposef(
-          ptr_out,
-          ptr_out_shape,
-          ptr_inp,
-          ptr_inp_shape,
-          ptr_permute_vec,
-          num_out_dims,
-          num_inp_dims);
+        ptr_out,
+        ptr_out_shape,
+        ptr_inp,
+        ptr_inp_shape,
+        ptr_permute_vec,
+        num_out_dims,
+        num_inp_dims);
 
       for (size_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
         size_t outer = outer_idx * outer_stride;
         for (size_t inner_idx = 0; inner_idx < stride; ++inner_idx) {
           size_t base = outer + inner_idx;
-
-          float* ptr_in_data = &ptr_out[base];
-          float* ptr_out_data = &ptr_out1[base];
+        
+          float *ptr_in_data = &ptr_out[base];
+          float *ptr_out_data = &ptr_out1[base];
 
           vsoftmaxf(ptr_out_data, ptr_in_data, size);
         }
       }
 
       tensor_transposef(
-          out_data,
-          ptr_inp_shape,
-          ptr_out1,
-          ptr_out_shape,
-          ptr_permute_vec,
-          num_out_dims,
-          num_inp_dims);
+        out_data,
+        ptr_inp_shape,
+        ptr_out1,
+        ptr_out_shape,
+        ptr_permute_vec,
+        num_out_dims,
+        num_inp_dims);
+
+      // Writeback output from cache to system memory for DMA coherency
+      xthal_dcache_region_writeback(out_data, sizeof(float) * in.numel());
+
 
       return out;
     }
@@ -270,13 +324,13 @@ Tensor& _softmax_out(
                   size,
                   stride);
 
-              const CTYPE temp_sum =
+              const CTYPE temp_sum = 
                   torch::executor::apply_unary_map_reduce_fn<CTYPE, CTYPE>(
                       [max_in](const CTYPE val_in) {
-                        return std::exp(val_in - max_in);
+                      return std::exp(val_in - max_in);
                       },
                       [](const CTYPE mapped_in, CTYPE val_accum) {
-                        return val_accum + mapped_in;
+                      return val_accum + mapped_in;
                       },
                       in_data + base,
                       size,
@@ -295,6 +349,7 @@ Tensor& _softmax_out(
             dim);
       });
 
+
   return out;
 }
 
diff --git a/backends/cadence/vision/third-party/CMakeLists.txt b/backends/cadence/vision/third-party/CMakeLists.txt
new file mode 100644
index 00000000000..12530d95322
--- /dev/null
+++ b/backends/cadence/vision/third-party/CMakeLists.txt
@@ -0,0 +1,101 @@
+cmake_minimum_required(VERSION 3.10.0)
+project(cadence_vision)
+
+# Collect all source files from the library directory
+file(GLOB_RECURSE VISION_LIB_SOURCES
+  "${CMAKE_CURRENT_SOURCE_DIR}/library/api/*.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/library/tables/*.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/library/dma.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/library/memory_manager.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/library/utils.c"
+)
+
+# Create the vision library
+add_library(xa_nnlib STATIC ${VISION_LIB_SOURCES})
+
+# Set include directories
+target_include_directories(xa_nnlib PUBLIC
+  ${CMAKE_CURRENT_SOURCE_DIR}/include
+)
+
+target_include_directories(xa_nnlib PRIVATE
+  ${CMAKE_CURRENT_SOURCE_DIR}/include_private
+)
+
+# Set properties for the library
+set_target_properties(xa_nnlib PROPERTIES
+  OUTPUT_NAME "xa_nnlib"
+  ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin"
+)
+
+# Create output directories
+file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin")
+
+# ============================================================================
+# libxai_common - Common utilities and data types
+# ============================================================================
+set(LIBXAI_COMMON_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai_common/src/xai_buildinfo.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai_common/src/xai_errstr.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai_common/src/cnn_cast.c
+)
+
+set(LIBXAI_COMMON_INCLUDE_DIRS
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai_common/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai_common/src
+)
+
+add_library(xai_common STATIC ${LIBXAI_COMMON_SOURCES})
+target_include_directories(xai_common PUBLIC ${LIBXAI_COMMON_INCLUDE_DIRS})
+# Allow XAI kernels to operate on system memory (not just local DRAM).
+# Required for cache-mode convolution variants that pass system memory pointers.
+target_compile_definitions(xai_common PUBLIC SYS_MEM_TESTING=1)
+
+# ============================================================================
+# libxai - CNN kernels library
+# ============================================================================
+set(LIBXAI_SOURCES
+    # Main convolution dispatcher (contains xaiConvolved3D)
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_conv.c
+    # Convolution dispatcher and variants
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_conv_VQ.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_conv_MOD.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_conv_MOW.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_conv_SO.c
+    # Dilated convolution VQ variants
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_VQ_MOW.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_VQ_MOD.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_VQ_MOD_S16.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_VQ_MOW_S16.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_VQ_SO.c
+    # Dilated convolution non-VQ variants
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_MOW.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_MOD.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_MOD_S16.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_MOW_S16.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_SO.c
+    # Data transform and helpers
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_datatransform.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_helper.c
+)
+
+set(LIBXAI_INCLUDE_DIRS
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src
+)
+
+add_library(xai STATIC ${LIBXAI_SOURCES})
+target_include_directories(xai PUBLIC ${LIBXAI_INCLUDE_DIRS})
+target_include_directories(xai PRIVATE ${LIBXAI_COMMON_INCLUDE_DIRS})
+target_link_libraries(xai PUBLIC xai_common)
+
+# ============================================================================
+# Export variables for parent CMakeLists.txt
+# ============================================================================
+set(XAI_INCLUDE_DIRS
+    ${LIBXAI_INCLUDE_DIRS}
+    ${LIBXAI_COMMON_INCLUDE_DIRS}
+    CACHE INTERNAL "XAI include directories"
+)
+
+set(XAI_LIBRARIES xai xai_common CACHE INTERNAL "XAI libraries")
diff --git a/backends/cadence/vision/third-party/dummy.c b/backends/cadence/vision/third-party/dummy.c
deleted file mode 100644
index 52fb7c18c38..00000000000
--- a/backends/cadence/vision/third-party/dummy.c
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-/* Dummy source file for non-Xtensa builds
- * This file is used when building the vision-nnlib library on platforms
- * other than Xtensa, providing empty stubs for compatibility.
- * The actual function implementations are provided as stubs via DISCARD_FUN
- * in headers when COMPILER_XTENSA is not defined.
- */
-
-// This file intentionally contains no function definitions and no includes.
-// When COMPILER_XTENSA is not defined, all functions are stubbed out
-// using the DISCARD_FUN macro in the header files.
diff --git a/backends/cadence/vision/third-party/include/api.h b/backends/cadence/vision/third-party/include/api.h
index efb80c3d76d..b89ab2ac263 100644
--- a/backends/cadence/vision/third-party/include/api.h
+++ b/backends/cadence/vision/third-party/include/api.h
@@ -69,12 +69,65 @@ N      multiple of BBE_SIMD_WIDTH (vsoftmax)
 void vsoftmaxf(float32_t *y, const float32_t *x, int N);
 
 void tensor_transposef(float32_t *restrict ptr_out
-    ,const int *const ptr_out_shape
-    ,const float32_t *restrict ptr_inp
-    ,const int *const ptr_inp_shape
-    ,const int *restrict ptr_permute_vec
-    ,int num_out_dims
-    ,int num_inp_dims);
+  ,const int *const ptr_out_shape
+  ,const float32_t *restrict ptr_inp
+  ,const int *const ptr_inp_shape
+  ,const int *restrict ptr_permute_vec
+  ,int num_out_dims
+  ,int num_inp_dims);
+
+void quantize_f32_asym8s(int8_t *restrict ptr_out
+  ,const float32_t *restrict ptr_inp
+  ,float32_t scale
+  ,int zero_bias
+  ,int N);
+
+void dequantize_asym8s_f32(float32_t *restrict ptr_out
+  ,const int8_t *restrict ptr_inp
+  ,float32_t scale
+  ,int zero_bias
+  ,int N);
+
+void maxpool2d_with_indices_j2x2_f32(float32_t *restrict ptr_out
+  ,const float32_t *restrict ptr_inp
+  ,int *restrict ptr_indices
+  ,int inp_height ,int inp_width
+  ,int out_height ,int out_width
+  ,int32_t in_pitch_width, int32_t in_pitch_height
+  ,int32_t out_pitch_width, int32_t out_pitch_height
+  ,uint8_t kernel_height
+  ,uint8_t kernel_width);
+
+void maxpool2d_j2x2_f32(float32_t *restrict ptr_out
+  ,const float32_t *restrict ptr_inp
+  ,int inp_height ,int inp_width
+  ,int out_height ,int out_width
+  ,int32_t in_pitch_width, int32_t in_pitch_height
+  ,int32_t out_pitch_width, int32_t out_pitch_height
+  ,uint8_t kernel_height
+  ,uint8_t kernel_width);
+
+void vrelU_quantized(
+		    int8_t* restrict ptr_out,
+		        const int8_t* restrict ptr_inp,
+			    int32_t in_zero_point,
+			        int32_t out_zero_point,
+				    float32_t out_scale,
+				        int N);
+void rvaddf(float32_t *restrict z, const float32_t *restrict x,
+		            const float32_t *restrict y, int N);
+
+void simd_mean_pool_2x2_to_1x1_float32(float32_t* restrict output, 
+                                       const float32_t* restrict input,
+                                       int N);
+
+int32_t rvdot_zeropt(
+    int32_t init_acc,
+    const int8_t *restrict x,
+    const int8_t *restrict y,
+    int8_t x_zp,
+    int8_t y_zp,
+    int N);
 
 #ifdef __cplusplus
 };
diff --git a/backends/cadence/vision/third-party/include/dma.h b/backends/cadence/vision/third-party/include/dma.h
new file mode 100644
index 00000000000..6e368bccd91
--- /dev/null
+++ b/backends/cadence/vision/third-party/include/dma.h
@@ -0,0 +1,42 @@
+/*
+ * dma.h
+ *
+ *  Created on: Oct 30, 2025
+ *      Author: sraut
+ */
+
+#ifndef __DMA_H__
+#define __DMA_H__
+
+// Enable DMA for cache-mode input copy (instead of xaiCopyTile3D)
+// NOTE: Requires AXI-to-AXI DMA support on the target core
+// Uncomment to use DMA 3D transfer in cache executors
+// #define USE_DMA_FOR_CACHE_COPY
+
+#define IDMA_USE_INTR 0
+#define IDMA_USE_MULTICHANNEL 1
+#define CHL_MAX 2
+#include <xtensa/hal.h>
+#include <xtensa/idma.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// DMA initialization functions
+void dma_2dm_init(int ch);
+void dma_3dm_init(int ch);
+
+// DMA transfer functions
+void dma_1dm(int ch,void *_psrc,void *_pdst, int num_bytes);
+void dma_2dm(int ch, void *_psrc, void *_pdst, int src_stride, int dst_stride,
+            int num_bytes, short num_lines);
+void dma_3dm(int ch, void *src, void *dst, int src_row_pitch, int dst_row_pitch,
+            int src_tile_pitch, int dst_tile_pitch, int row_sz,
+            int nrows, int ntiles) ;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __DMA_H__ */
diff --git a/backends/cadence/vision/third-party/include/dtypes.h b/backends/cadence/vision/third-party/include/dtypes.h
index c12bbf23ac2..1b94594d216 100644
--- a/backends/cadence/vision/third-party/include/dtypes.h
+++ b/backends/cadence/vision/third-party/include/dtypes.h
@@ -164,23 +164,38 @@
 #define inline_ static inline
 #endif
 
+#ifndef MAX_FLT32
+#define MAX_FLT32  (3.402823466e+38F)
+#endif
+#ifndef MIN_FLT32
+#define MIN_FLT32  (- MAX_FLT32)
+#endif
+#ifndef MIN_ABS_FLT32
+#define MIN_ABS_FLT32  (1.175494351e-38F)
+#endif
+#ifndef MAX_INT8
+#define MAX_INT8  (0x7f)
+#endif
+#ifndef MIN_INT8
+#define MIN_INT8  (- MAX_INT8 - 1)
+#endif
 #ifndef MAX_INT16
-#define MAX_INT16 ((int16_t)0x7FFF)
+#define MAX_INT16 (0x7FFF)
 #endif
 #ifndef MIN_INT16
-#define MIN_INT16 ((int16_t)0x8000)
+#define MIN_INT16 (-0x7FFF - 1) /* bare 0x8000 is +32768, not the int16 minimum */
 #endif
 #ifndef MAX_INT32
-#define MAX_INT32 ((int32_t)0x7FFFFFFFL)
+#define MAX_INT32 (0x7FFFFFFFL)
 #endif
 #ifndef MIN_INT32
-#define MIN_INT32 ((int32_t)0x80000000L)
+#define MIN_INT32 (-0x7FFFFFFFL - 1) /* bare 0x80000000L is a positive value */
 #endif
 #ifndef MIN_INT64
-#define MIN_INT64 ((int64_t)0x8000000000000000LL)
+#define MIN_INT64 (-0x7fffffffffffffffLL - 1) /* bare 0x8000000000000000LL is unsigned */
 #endif
 #ifndef MAX_INT64
-#define MAX_INT64 ((int64_t)0x7fffffffffffffffLL)
+#define MAX_INT64 (0x7fffffffffffffffLL)
 #endif
 
 /* size of variables in bytes */
@@ -190,6 +205,22 @@
 #define SIZEOF_BYTE(x) sizeof(x)
 #endif
 
+#ifndef FLT32_SIZE
+#define FLT32_SIZE  4
+#endif
+#ifndef INT8_SIZE
+#define INT8_SIZE 1
+#endif
+#ifndef INT16_SIZE
+#define INT16_SIZE 2
+#endif
+#ifndef INT32_SIZE
+#define INT32_SIZE 4
+#endif
+#ifndef INT64_SIZE
+#define INT64_SIZE 8
+#endif
+
 /*---------------------------------------
  special keywords definition
  restrict  keyword means that the memory
diff --git a/backends/cadence/vision/third-party/include/dump_tensor.h b/backends/cadence/vision/third-party/include/dump_tensor.h
new file mode 100644
index 00000000000..ab2bb219289
--- /dev/null
+++ b/backends/cadence/vision/third-party/include/dump_tensor.h
@@ -0,0 +1,70 @@
+/*
+ * Dump output tensor data after each operator for layer-by-layer comparison.
+ * Include with: #include <dump_tensor.h>
+ *
+ * Output format:
+ *   LAYER_DUMP : <op> : <numel> : dtype=<d> : first=[v0,v1,...] : sum=<s> : min=<lo> : max=<hi>
+ *
+ * Compare generic vs optimized:
+ *   grep LAYER_DUMP generic.log > gen_dump.txt
+ *   grep LAYER_DUMP opt.log     > opt_dump.txt
+ *   diff gen_dump.txt opt_dump.txt
+ */
+#pragma once
+
+#include <cstdio>
+#include <cstdint>
+#include <cfloat>
+
+/* ScalarType values: Byte=0, Char=1, Short=2, Int=3, Long=4, Half=5, Float=6 */
+
+#define _DUMP_N 16  /* number of leading values to print */
+
+#define DUMP_TENSOR(name, tensor) do { /* prints one LAYER_DUMP line for `tensor` */ \
+    const auto _dn = (tensor).numel(); \
+    const int _dt = (int)(tensor).scalar_type(); \
+    printf("LAYER_DUMP : %s : %d : dtype=%d", #name, (int)_dn, _dt); \
+    if (_dt == 6) { /* Float */ \
+        const float* _dp = (tensor).const_data_ptr<float>(); \
+        int _k = (int)_dn < _DUMP_N ? (int)_dn : _DUMP_N; \
+        printf(" : first=["); \
+        for (int _i = 0; _i < _k; _i++) printf("%s%.6f", _i?",":"", _dp[_i]); \
+        printf("]"); \
+        double _sum = 0; float _lo = _dn > 0 ? _dp[0] : 0.0f, _hi = _lo; /* no read of _dp[0] on empty tensors */ \
+        for (int _i = 0; _i < (int)_dn; _i++) { \
+            _sum += _dp[_i]; \
+            if (_dp[_i] < _lo) _lo = _dp[_i]; \
+            if (_dp[_i] > _hi) _hi = _dp[_i]; \
+        } \
+        printf(" : sum=%.4f : min=%.6f : max=%.6f", _sum, _lo, _hi); \
+    } else if (_dt == 1) { /* Char / int8 */ \
+        const int8_t* _dp = (tensor).const_data_ptr<int8_t>(); \
+        int _k = (int)_dn < _DUMP_N ? (int)_dn : _DUMP_N; \
+        printf(" : first=["); \
+        for (int _i = 0; _i < _k; _i++) printf("%s%d", _i?",":"", (int)_dp[_i]); \
+        printf("]"); \
+        int64_t _sum = 0; int _lo = _dn > 0 ? _dp[0] : 0, _hi = _lo; /* no read of _dp[0] on empty tensors */ \
+        for (int _i = 0; _i < (int)_dn; _i++) { \
+            _sum += _dp[_i]; \
+            if (_dp[_i] < _lo) _lo = _dp[_i]; \
+            if (_dp[_i] > _hi) _hi = _dp[_i]; \
+        } \
+        printf(" : sum=%lld : min=%d : max=%d", (long long)_sum, _lo, _hi); \
+    } else if (_dt == 0) { /* Byte / uint8 */ \
+        const uint8_t* _dp = (tensor).const_data_ptr<uint8_t>(); \
+        int _k = (int)_dn < _DUMP_N ? (int)_dn : _DUMP_N; \
+        printf(" : first=["); \
+        for (int _i = 0; _i < _k; _i++) printf("%s%u", _i?",":"", (unsigned)_dp[_i]); \
+        printf("]"); \
+        int64_t _sum = 0; int _lo = _dn > 0 ? _dp[0] : 0, _hi = _lo; /* no read of _dp[0] on empty tensors */ \
+        for (int _i = 0; _i < (int)_dn; _i++) { \
+            _sum += _dp[_i]; \
+            if ((int)_dp[_i] < _lo) _lo = _dp[_i]; \
+            if ((int)_dp[_i] > _hi) _hi = _dp[_i]; \
+        } \
+        printf(" : sum=%lld : min=%d : max=%d", (long long)_sum, _lo, _hi); \
+    } else { \
+        printf(" : (unsupported dtype)"); \
+    } \
+    printf("\n"); \
+} while(0)
diff --git a/backends/cadence/vision/third-party/include/lib.h b/backends/cadence/vision/third-party/include/lib.h
new file mode 100644
index 00000000000..4a7e31ee92b
--- /dev/null
+++ b/backends/cadence/vision/third-party/include/lib.h
@@ -0,0 +1,72 @@
+/* ------------------------------------------------------------------------ */
+/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.  */
+/* These coded instructions, statements, and computer programs ('Cadence    */
+/* Libraries') are the copyrighted works of Cadence Design Systems Inc.     */
+/* Cadence IP is licensed for use with Cadence processor cores only and     */
+/* must not be used for any other processors and platforms. Your use of the */
+/* Cadence Libraries is subject to the terms of the license agreement you   */
+/* have entered into with Cadence Design Systems, or a sublicense granted   */
+/* to you by a direct Cadence licensee.                                     */
+/* ------------------------------------------------------------------------ */
+
+#ifndef __LIB_H__
+#define __LIB_H__
+
+#include "dtypes.h"
+#include "api.h"
+#include <stdio.h>
+
+#include "dma.h"
+#include "memory_manager.h"
+#include "utils.h"
+
+#if defined COMPILER_XTENSA
+
+#include <xtensa/config/core-isa.h>
+#include <xtensa/tie/xt_ivpn.h>
+#define IVP_SIMD_WIDTH XCHAL_IVPN_SIMD_WIDTH
+
+// Performance measurement macros
+#define XTPERF_PRINTF(...) printf(__VA_ARGS__)
+#define TIME_DECL(test) long start_time_##test, end_time_##test;
+#define TIME_START(test) { start_time_##test = XT_RSR_CCOUNT(); }
+#define TIME_END(test) { end_time_##test = XT_RSR_CCOUNT(); }
+#define TIME_DISPLAY(test, opcnt, opname) { long long cycles_##test = end_time_##test - start_time_##test; \
+		XTPERF_PRINTF("PERF_LOG : %s : %d : %s : %lld : cycles : %.2f : %s/cycle : %.2f : cycles/%s\n", \
+		       #test, opcnt, opname, cycles_##test, cycles_##test == 0 ? 0 : (double)(opcnt)/cycles_##test, \
+           opname, cycles_##test == 0 ? 0 : 1/((double)(opcnt)/cycles_##test), opname); }
+
+
+// // IDMA Initializations and declarations
+// #if XCHAL_HAVE_IDMA
+// #ifndef IDMA_USE_MULTICHANNEL
+//   #define IDMA_USE_MULTICHANNEL 1
+// #endif
+// #ifndef CHL_MAX
+//   #define CHL_MAX 2
+// #endif
+// #include <xtensa/idma.h>
+// #endif
+
+// #ifndef DRAM0_BUFF_SIZE // To be defined at compile time
+//   #error "DRAM0_BUFF_SIZE not defined"
+// #endif
+
+// #ifndef DRAM1_BUFF_SIZE // To be defined at compile time
+//   #error "DRAM1_BUFF_SIZE not defined"
+// #endif
+
+// #ifndef PLACE_IN_DRAM0
+// 	#define PLACE_IN_DRAM0 __attribute__ ((aligned(2*IVP_SIMD_WIDTH), section(".dram0.data")))
+// #endif
+
+// #ifndef PLACE_IN_DRAM1
+// 	#define PLACE_IN_DRAM1 __attribute__ ((aligned(2*IVP_SIMD_WIDTH), section(".dram1.data")))
+// #endif
+
+extern void *ptr_dram0;
+extern void *ptr_dram1;
+
+#endif // COMPILER_XTENSA
+
+#endif // __LIB_H__
\ No newline at end of file
diff --git a/backends/cadence/vision/third-party/include/memory_manager.h b/backends/cadence/vision/third-party/include/memory_manager.h
new file mode 100644
index 00000000000..5430d075042
--- /dev/null
+++ b/backends/cadence/vision/third-party/include/memory_manager.h
@@ -0,0 +1,69 @@
+/*
+ * memory_manager.h
+ *
+ *  Created on: Nov 6, 2025
+ *      Author: sraut
+ *
+ *  Description: Dynamic memory allocator for DRAM0, DRAM1, and local SRAM regions
+ *               Provides simple arena-style allocation with 64-byte alignment
+ */
+
+#ifndef MEMORY_MANAGER_H_
+#define MEMORY_MANAGER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "../../operators/layer_configs.h"  // For IDMA_BUFFER_SIZE_DRAM0/DRAM1
+
+// ============================================================================
+// Memory Configuration
+// ============================================================================
+
+// Cache-mode padded input buffer size (in system memory)
+// Must fit the largest padded input tensor for cache-mode layers
+// For ResNet layers that don't fit in DRAM tiling (e.g., 56x56x128)
+#ifndef CACHE_PADDED_INPUT_SIZE
+#define CACHE_PADDED_INPUT_SIZE  (1024 * 1024)  // 1 MB max
+#endif
+
+// ============================================================================
+// Dynamic Memory Allocator for DRAM0 and DRAM1
+// ============================================================================
+
+// Memory pools placed in specific DRAM sections
+// Declared extern here, defined in memory_manager.c
+extern uint8_t dram0_pool[IDMA_BUFFER_SIZE_DRAM0];
+extern uint8_t dram1_pool[IDMA_BUFFER_SIZE_DRAM1];
+
+// Cache-mode padded input buffer (in system memory)
+// Used by cache-mode kernels for edge padding
+extern int8_t cache_padded_input[CACHE_PADDED_INPUT_SIZE];
+
+/**
+ * @brief Allocate DRAM buffer with SIMD alignment
+ * @param size Size in bytes to allocate
+ * @param dram_bank Which DRAM bank (0 or 1)
+ * @param dram0_used Pointer to current dram0 usage counter
+ * @param dram1_used Pointer to current dram1 usage counter
+ * @return Pointer to allocated buffer
+ */
+int8_t* allocate_dram_buffer(int size, int dram_bank, int* dram0_used, int* dram1_used);
+
+/**
+ * @brief Accessor for the cache-mode padded input buffer (system memory)
+ * @return Address of the first element of cache_padded_input
+ */
+static inline int8_t* get_cache_padded_input(void) {
+    return &cache_padded_input[0];
+}
+
+/**
+ * @brief Report the capacity of the cache-mode padded input buffer
+ * @return CACHE_PADDED_INPUT_SIZE, i.e. the buffer length in bytes
+ */
+static inline size_t get_cache_padded_input_size(void) {
+    return (size_t)(CACHE_PADDED_INPUT_SIZE);
+}
+
+#endif /* MEMORY_MANAGER_H_ */
diff --git a/backends/cadence/vision/third-party/include/utils.h b/backends/cadence/vision/third-party/include/utils.h
new file mode 100644
index 00000000000..eb659c291c8
--- /dev/null
+++ b/backends/cadence/vision/third-party/include/utils.h
@@ -0,0 +1,182 @@
+/*
+ * utils.h
+ *
+ *  Created on: Nov 4, 2025
+ *      Author: sraut
+ */
+
+#ifndef UTILS_H_
+#define UTILS_H_
+
+#include <stdint.h>
+#include <xtensa/tie/xt_ivpn.h>
+#include "../libxai_common/include/xai_tile_manager.h"
+
+
+// required for windows
+#undef min
+#undef max
+/** @brief Smaller of two ints (the common value when they are equal) */
+static inline int min(int a, int b) { return (b > a) ? a : b; }
+/** @brief Larger of two ints (the common value when they are equal) */
+static inline int max(int a, int b) { return (b < a) ? a : b; }
+
+/**
+ * @brief Add a carry-in to an iterator and wrap it to 0 when it reaches the bound
+ * @param temp Output location that receives the updated iterator value
+ * @param var Current iterator value
+ * @param bound Value at which the iterator wraps back to 0
+ * @param carry Carry-in added to var
+ * @return Carry-out: 1 if var + carry equals bound (iterator wrapped), else 0
+ */
+static inline int inc_iter_to_temp(int *temp, int var, int bound, int carry) {
+    const int advanced = var + carry;
+    const int wrapped = (advanced == bound);
+    *temp = wrapped ? 0 : advanced;
+    return wrapped;
+}
+
+/**
+ * @brief Exchange two int8_t buffer pointers in place
+ * @param a Address of the first buffer pointer
+ * @param b Address of the second buffer pointer
+ */
+static inline void swap_buffers(int8_t **a, int8_t **b) {
+    int8_t *const held_b = *b;
+    *b = *a;
+    *a = held_b;
+}
+
+static inline void _proto_FillBuffer_I8(void *buff, int val, unsigned size) {
+  // Writes the 8-bit pattern val over size bytes starting at buff.
+  unsigned its = size / (2 * XCHAL_IVPN_SIMD_WIDTH);  // number of full-vector stores
+  unsigned rem = size % (2 * XCHAL_IVPN_SIMD_WIDTH);  // bytes left for the final partial store
+  xb_vec2Nx8 *pDst = (xb_vec2Nx8 *)buff;
+  valign vaDst = IVP_ZALIGN();
+  xb_vec2Nx8 pattern = IVP_MOVVA8(val);  // val replicated into every 8-bit lane
+  for (unsigned i = 0; i < its; i++) {
+    IVP_SAV2NX8_XP(pattern, vaDst, pDst, 2 * XCHAL_IVPN_SIMD_WIDTH);
+  }
+  IVP_SAV2NX8_XP(pattern, vaDst, pDst, rem);  // tail: only rem bytes requested
+  IVP_SAPOS2NX8_FP(vaDst, pDst);  // presumably flushes bytes still held in vaDst - confirm against the ISA reference
+}
+
+/**
+ * @brief Setup a tile3D descriptor for cache-mode input tile
+ *
+ * Initializes a tile3D descriptor (XAI_WHD order, XAI_TILE3D_S8 type) over a
+ * local buffer: sizes, edges and pitches are set, all coordinates are zeroed and
+ * the data pointer is placed past the leading edge of every dimension.
+ *
+ * @param tile Pointer to tile3D descriptor to initialize
+ * @param buffer Pointer to data buffer (in SRAM)
+ * @param dim1_size,dim2_size,dim3_size Width, height, channels without padding
+ * @param dim1_edge1,dim1_edge2 Left / right edge padding
+ * @param dim2_edge1,dim2_edge2 Top / bottom edge padding
+ * @param dim3_edge1,dim3_edge2 Channel edge padding at start / end
+ * @param stride_alignment dim1 pitch alignment (typically 2*XCHAL_IVPN_SIMD_WIDTH);
+ *        rounded up with a bit mask, so it must be a power of two; <= 0 disables it
+ */
+static inline void setup_tile3d_cache_input(
+    xai_tile3D* tile,
+    int8_t* buffer,
+    int dim1_size,        // Width (W)
+    int dim2_size,        // Height (H)
+    int dim3_size,        // Channels (D)
+    int dim1_edge1,       // Left edge
+    int dim1_edge2,       // Right edge
+    int dim2_edge1,       // Top edge
+    int dim2_edge2,       // Bottom edge
+    int dim3_edge1,       // Channel edge start
+    int dim3_edge2,       // Channel edge end
+    int stride_alignment  // Pitch alignment
+) {
+    // Calculate padded dimensions
+    int padded_dim1 = dim1_size + dim1_edge1 + dim1_edge2;
+    int padded_dim2 = dim2_size + dim2_edge1 + dim2_edge2;
+    int padded_dim3 = dim3_size + dim3_edge1 + dim3_edge2;
+    
+    // Calculate aligned pitch for dim1 (mask round-up: valid for power-of-two alignments only)
+    int dim1_pitch = padded_dim1;
+    if (stride_alignment > 0) {
+        dim1_pitch = (padded_dim1 + stride_alignment - 1) & ~(stride_alignment - 1);
+    }
+    
+    // Calculate pitch for dim2 (elements per padded W x H plane)
+    int dim2_pitch = dim1_pitch * padded_dim2;
+    
+    // Calculate total buffer size (all padded planes)
+    int buffer_size = dim2_pitch * padded_dim3;
+    
+    // Initialize tile descriptor; data pointer skips the leading edges of all three dimensions
+    XAI_TILE3D_SET_BUFF_PTR(tile, buffer);
+    XAI_TILE3D_SET_BUFF_SIZE(tile, buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(tile, buffer + (dim3_edge1 * dim2_pitch) + 
+                                           (dim2_edge1 * dim1_pitch) + 
+                                           dim1_edge1);
+    XAI_TILE3D_SET_DATA_ORDER(tile, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(tile, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(tile, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(tile, 0);
+    
+    // Set dimensions
+    XAI_TILE3D_SET_DIM1(tile, dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(tile, dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(tile, dim1_edge2);
+    XAI_TILE3D_SET_DIM1_PITCH(tile, dim1_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(tile, 0);
+    
+    XAI_TILE3D_SET_DIM2(tile, dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(tile, dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(tile, dim2_edge2);
+    XAI_TILE3D_SET_DIM2_PITCH(tile, dim2_pitch);
+    XAI_TILE3D_SET_DIM2_COORD(tile, 0);
+    
+    XAI_TILE3D_SET_DIM3(tile, dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(tile, dim3_edge1);
+    XAI_TILE3D_SET_DIM3_EDGE2(tile, dim3_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(tile, 0);
+}
+
+/**
+ * @brief Setup source tile descriptor for raw input data (before copy)
+ * 
+ * Describes the un-padded source in system memory (all edges and coordinates 0,
+ * XAI_WHD, S8) before it is copied to the padded SRAM tile; the data pointer is
+ * the buffer start and the buffer size is recorded as dim2_pitch * dim3_size.
+ */
+static inline void setup_tile3d_source(
+    xai_tile3D* tile,
+    int8_t* buffer,
+    int dim1_size,
+    int dim2_size,
+    int dim3_size,
+    int dim1_pitch,
+    int dim2_pitch
+) {
+    XAI_TILE3D_SET_BUFF_PTR(tile, buffer);
+    XAI_TILE3D_SET_BUFF_SIZE(tile, dim2_pitch * dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(tile, buffer);
+    XAI_TILE3D_SET_DATA_ORDER(tile, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(tile, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(tile, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(tile, 0);
+    
+    XAI_TILE3D_SET_DIM1(tile, dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(tile, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(tile, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(tile, dim1_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(tile, 0);
+    
+    XAI_TILE3D_SET_DIM2(tile, dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(tile, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(tile, 0);
+    XAI_TILE3D_SET_DIM2_PITCH(tile, dim2_pitch);
+    XAI_TILE3D_SET_DIM2_COORD(tile, 0);
+    
+    XAI_TILE3D_SET_DIM3(tile, dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(tile, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(tile, 0);
+    XAI_TILE3D_SET_DIM3_COORD(tile, 0);
+}
+
+#endif /* UTILS_H_ */
diff --git a/backends/cadence/vision/third-party/include_private/common.h b/backends/cadence/vision/third-party/include_private/common.h
index 4fc07d8b4d1..e80e5e3775a 100644
--- a/backends/cadence/vision/third-party/include_private/common.h
+++ b/backends/cadence/vision/third-party/include_private/common.h
@@ -33,19 +33,10 @@
 #include <xtensa/tie/xt_core.h>
 #include <xtensa/tie/xt_density.h>
 #include <xtensa/tie/xt_misc.h>
-#if XCHAL_HAVE_IDMA
-#ifndef IDMA_USE_MULTICHANNEL
-  #define IDMA_USE_MULTICHANNEL 1
-#endif
-#include <xtensa/idma.h>
-#endif
 #define IVP_SIMD_WIDTH XCHAL_IVPN_SIMD_WIDTH
 
 #include "xtensa/config/core-isa.h"
 #include "xtensa/tie/xt_ivpn.h"
-#if XCHAL_HAVE_IDMA
-#include "xtensa/idma.h"
-#endif
 
 #ifdef _MSC_VER
 #define ALIGN(x) _declspec(align(x))
@@ -70,16 +61,6 @@
 #define restrict_clang
 #endif
 
-// Performance measurement macros
-#define XTPERF_PRINTF(...) printf(__VA_ARGS__)
-#define TIME_DECL(test) long start_time_##test, end_time_##test;
-#define TIME_START(test) { start_time_##test = 0;   XT_WSR_CCOUNT(0); }
-#define TIME_END(test) { end_time_##test = XT_RSR_CCOUNT(); }
-#define TIME_DISPLAY(test, opcnt, opname) { long long cycles_##test = end_time_##test - start_time_##test; \
-		XTPERF_PRINTF("PERF_LOG : %s : %d : %s : %lld : cycles : %.2f : %s/cycle : %.2f : cycles/%s\n", \
-		       #test, opcnt, opname, cycles_##test, cycles_##test == 0 ? 0 : (double)(opcnt)/cycles_##test, \
-           opname, cycles_##test == 0 ? 0 : 1/((double)(opcnt)/cycles_##test), opname); }
-
 //-----------------------------------------------------
 // log2(BBE_SIMD_WIDTH)
 //-----------------------------------------------------
@@ -190,6 +171,21 @@
 #define HAVE_32X32 0
 #endif
 
+/*------ INSTRUCTION EMULATIONS ------*/
+// Fallback used only when IVP_ADDSN_2X32 is not already defined: b_*1 + c_*1 is accumulated in a wide xb_vecN_2x64w and packed back to 32 bits with shift 0 (presumably saturating - confirm against the ISA reference). Relies on the GNU statement-expression extension.
+#ifndef IVP_ADDSN_2X32
+#define IVP_ADDSN_2X32(b_, c_)                                                 \
+  ({                                                                           \
+    xb_vecN_2x32v a_;                                                          \
+    xb_vecN_2x64w tmp_a_;                                                      \
+    tmp_a_ = IVP_MULN_2X32(b_, 1);                                             \
+    IVP_MULAN_2X32(tmp_a_, c_, 1);                                             \
+    a_ = IVP_PACKVRN_2X64W(tmp_a_, 0);                                         \
+    a_;                                                                        \
+  })
+#endif
+
+
 #ifdef __cplusplus
 #define externC extern "C"
 #else
diff --git a/backends/cadence/vision/third-party/include_private/idma_init.h b/backends/cadence/vision/third-party/include_private/idma_init.h
deleted file mode 100644
index a885bdf6086..00000000000
--- a/backends/cadence/vision/third-party/include_private/idma_init.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef __IDMA__INIT_H__
-#define __IDMA__INIT_H__
-
-#include "../include/dtypes.h"
-#include "common.h"
-
- // 4 kb x sizeof(float32_t) = 16 kb DRAM storage. Assume 4 buffers (2 input and 2 output)
-#define IDMA_BUFF_SIZE 4096
-
-#ifndef PLACE_IN_DRAM0
-#define PLACE_IN_DRAM0 \
-  __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram0.data")))
-#endif
-
-#ifndef PLACE_IN_DRAM1
-#define PLACE_IN_DRAM1 \
-  __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram1.data")))
-#endif
-
-float32_t data_dram0[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM0;
-float32_t data_dram1[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM1;
-
-float32_t* inpData[2] = {&data_dram0[0], &data_dram1[0]};
-float32_t* outData[2] = {
-    &data_dram0[IDMA_BUFF_SIZE / 4],
-    &data_dram1[IDMA_BUFF_SIZE / 4]};
-
-IDMA_BUFFER_DEFINE(buffer_idma_ch0, 1, IDMA_2D_DESC);
-IDMA_BUFFER_DEFINE(buffer_idma_ch1, 1, IDMA_2D_DESC);
-
-idma_buffer_t* descbuf[] = {
-    buffer_idma_ch0,
-    buffer_idma_ch1,
-};
-
-#endif // __IDMA__INIT_H__
diff --git a/backends/cadence/vision/third-party/library/api/dequantize.c b/backends/cadence/vision/third-party/library/api/dequantize.c
new file mode 100644
index 00000000000..98b707887bd
--- /dev/null
+++ b/backends/cadence/vision/third-party/library/api/dequantize.c
@@ -0,0 +1,81 @@
+/* ------------------------------------------------------------------------ */
+/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.  */
+/* These coded instructions, statements, and computer programs ('Cadence    */
+/* Libraries') are the copyrighted works of Cadence Design Systems Inc.     */
+/* Cadence IP is licensed for use with Cadence processor cores only and     */
+/* must not be used for any other processors and platforms. Your use of the */
+/* Cadence Libraries is subject to the terms of the license agreement you   */
+/* have entered into with Cadence Design Systems, or a sublicense granted   */
+/* to you by a direct Cadence licensee.                                     */
+/* ------------------------------------------------------------------------ */
+
+#include "api.h"
+#include "common.h"
+
+#if !HAVE_VFPU
+DISCARD_FUN(void, dequantize_asym8s_f32, (float32_t *restrict ptr_out
+  ,const int8_t *restrict ptr_inp
+  ,float32_t scale
+  ,int zero_bias
+  ,int N))
+#else
+void dequantize_asym8s_f32(float32_t *restrict ptr_out
+  ,const int8_t *restrict ptr_inp
+  ,float32_t scale
+  ,int zero_bias
+  ,int N)
+{
+  // Vector-typed views of the int8 input and the float32 output: out[i] = scale * (inp[i] - zero_bias)
+  xb_vecNx8 *p_i = (xb_vecNx8 *)ptr_inp;
+  xb_vecN_2xf32 *p_o = (xb_vecN_2xf32 *)ptr_out;
+
+  // Loop index
+  int n;
+
+  // Alignment state for the aligning load (primed from p_i) and the aligning stores
+  valign al_i = IVP_LANX8S_PP(p_i);
+  valign al_o = IVP_ZALIGN();
+  // Main loop: one vector of IVP_SIMD_WIDTH int8 inputs per iteration, written as two float32 vectors
+  for (n = 0; n < (N >> LOG2_IVP_SIMD_WIDTH); n++)
+  {
+    xb_vecNx16 inp;
+    xb_vecN_2x32v inp1_bias, inp2_bias;
+    xb_vecN_2xf32 out1, out2;
+
+    IVP_LANX8S_XP(inp, al_i, p_i, IVP_SIMD_WIDTH);
+
+    inp1_bias = IVP_UNPKSNX16_L(inp);
+    inp2_bias = IVP_UNPKSNX16_H(inp);
+
+    inp1_bias = IVP_SUBN_2X32(inp1_bias, (xb_vecN_2x32v) zero_bias);
+    out1 = IVP_MULN_2XF32(scale, (xb_vecN_2xf32) inp1_bias);  // cast presumably converts int32 lanes to float32 - confirm
+
+    inp2_bias = IVP_SUBN_2X32(inp2_bias, (xb_vecN_2x32v) zero_bias);
+    out2 = IVP_MULN_2XF32(scale, (xb_vecN_2xf32) inp2_bias);
+
+    IVP_SAN_2XF32_IP(out1, al_o, p_o);
+    IVP_SAN_2XF32_IP(out2, al_o, p_o);
+  }
+  if (N & (IVP_SIMD_WIDTH - 1)) // Tail: fewer than IVP_SIMD_WIDTH elements remain
+  {
+    xb_vecNx16 inp;
+    xb_vecN_2x32v inp1_bias, inp2_bias;
+    xb_vecN_2xf32 out1, out2;
+
+    IVP_LANX8S_XP(inp, al_i, p_i, N & (IVP_SIMD_WIDTH - 1));
+
+    inp1_bias = IVP_UNPKSNX16_L(inp);
+    inp2_bias = IVP_UNPKSNX16_H(inp);
+
+    inp1_bias = IVP_SUBN_2X32(inp1_bias, (xb_vecN_2x32v) zero_bias);
+    out1 = IVP_MULN_2XF32(scale, (xb_vecN_2xf32) inp1_bias);
+
+    inp2_bias = IVP_SUBN_2X32(inp2_bias, (xb_vecN_2x32v) zero_bias);
+    out2 = IVP_MULN_2XF32(scale, (xb_vecN_2xf32) inp2_bias);
+    // Byte counts are 4 * remaining elements; the second count is negative when fewer than IVP_SIMD_WIDTH / 2 remain (presumably stored as zero bytes - confirm against the ISA reference)
+    IVP_SAVN_2XF32_XP(out1, al_o, p_o, 4 * (N & (IVP_SIMD_WIDTH - 1)));
+    IVP_SAVN_2XF32_XP(out2, al_o, p_o, 4 * ((N & (IVP_SIMD_WIDTH - 1)) - (IVP_SIMD_WIDTH >> 1)));
+  }
+  IVP_SAPOSN_2XF32_FP(al_o, p_o);
+}
+#endif
\ No newline at end of file
diff --git a/backends/cadence/vision/third-party/library/api/maxpool2df.c b/backends/cadence/vision/third-party/library/api/maxpool2df.c
new file mode 100644
index 00000000000..76faac55ea6
--- /dev/null
+++ b/backends/cadence/vision/third-party/library/api/maxpool2df.c
@@ -0,0 +1,248 @@
+/* ------------------------------------------------------------------------ */
+/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.  */
+/* These coded instructions, statements, and computer programs ('Cadence    */
+/* Libraries') are the copyrighted works of Cadence Design Systems Inc.     */
+/* Cadence IP is licensed for use with Cadence processor cores only and     */
+/* must not be used for any other processors and platforms. Your use of the */
+/* Cadence Libraries is subject to the terms of the license agreement you   */
+/* have entered into with Cadence Design Systems, or a sublicense granted   */
+/* to you by a direct Cadence licensee.                                     */
+/* ------------------------------------------------------------------------ */
+
+#include "api.h"
+#include "common.h"
+
+#if !HAVE_VFPU
+DISCARD_FUN(void, maxpool2d_with_indices_j2x2_f32, (float32_t *restrict ptr_out
+  ,const float32_t *restrict ptr_inp
+  ,int *restrict ptr_indices
+  ,int inp_height ,int inp_width
+  ,int out_height ,int out_width
+  ,int32_t in_pitch_width, int32_t in_pitch_height
+  ,int32_t out_pitch_width, int32_t out_pitch_height
+  ,uint8_t kernel_height
+  ,uint8_t kernel_width))
+
+DISCARD_FUN(void, maxpool2d_j2x2_f32, (float32_t *restrict ptr_out
+  ,const float32_t *restrict ptr_inp
+  ,int inp_height ,int inp_width
+  ,int out_height ,int out_width
+  ,int32_t in_pitch_width, int32_t in_pitch_height
+  ,int32_t out_pitch_width, int32_t out_pitch_height
+  ,uint8_t kernel_height
+  ,uint8_t kernel_width))
+#else
+void maxpool2d_with_indices_j2x2_f32(float32_t *restrict ptr_out
+  ,const float32_t *restrict ptr_inp
+  ,int *restrict ptr_indices
+  ,int inp_height ,int inp_width
+  ,int out_height ,int out_width
+  ,int32_t in_pitch_width, int32_t in_pitch_height
+  ,int32_t out_pitch_width, int32_t out_pitch_height
+  ,uint8_t kernel_height
+  ,uint8_t kernel_width)
+{
+  const int32_t out_increment = ((IVP_SIMD_WIDTH - kernel_width) / 2) + 1;
+  // out_increment is the step of the x loop below, i.e. the number of output columns handled per strip.
+  int32_t x, y, kx, ky;
+  int32_t remX, remXLoad;
+  // NOTE: inp_height, inp_width, in_pitch_height and out_pitch_height are not referenced in this body.
+  xb_vecN_2xf32* restrict pdvecOut;
+  xb_vecN_2x32v* restrict pdvecIdx;
+  xb_vecN_2xf32* restrict pdvecIn;
+
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2xf32 dvecMax1;
+  xb_vecN_2xf32 dvecMax11, dvecMax12;
+  xb_vecN_2xf32 dvecData11, dvecData12;
+  xb_vecN_2x32v dvecKxIdx1, dvecKyIdx1;
+  xb_vecN_2x32v dvecKyIdx11, dvecKyIdx12;
+  xb_vecN_2x32v dvecIdx1;
+
+  vboolN_2 dboolGT, dboolEq;
+  vboolN_2 dboolkyIdxLT;
+  xb_vecN_2x32v dvecGTKyIdx, dvecEQKyIdx;
+  xb_vecN_2x32v dvecGTKxIdx, dvecEQKxIdx;
+
+  vboolN_2 dvbKernelType = IVP_EQN_2X32((kernel_width % 2), 0);  // compares (kernel_width % 2) with 0: "kernel width is even"
+
+  for (x = 0; x < out_width; x += out_increment) {
+    remX = XT_MIN(out_width - x, out_increment);  // output columns in this strip
+    remXLoad = ((2 * (remX - 1) + kernel_width) > (IVP_SIMD_WIDTH / 2)) ? 1 : 0;  // 1 when the strip needs input beyond IVP_SIMD_WIDTH / 2 floats
+    int32_t remXOffset = remXLoad * (IVP_SIMD_WIDTH / 2);
+
+    for (y = 0; y < out_height; y++) {
+      float* pOut = &ptr_out[y * out_pitch_width + x];
+      int32_t* pIdx = &ptr_indices[y * out_pitch_width + x];  // indices are addressed with the output pitch
+      const float* pSrc = ptr_inp + y * in_pitch_width * 2 + x * 2;  // stride 2 in both directions is hard-coded
+      pdvecIn = (xb_vecN_2xf32*) pSrc;
+
+      // Initialize max values
+      dvecMax1 = MIN_FLT32;
+      dvecMax11 = dvecMax12 = dvecMax1;
+
+      // Initialize index tracking
+      dvecKxIdx1 = 0;
+      dvecKyIdx1 = 0;
+      dvecKyIdx11 = dvecKyIdx12 = 0;
+
+      // ========== KERNEL HEIGHT COMPARISONS ==========
+      for (ky = 0; ky < kernel_height; ky++) {
+        IVP_L2UN_2XF32_XP(dvecData11, pdvecIn, remXOffset * sizeof(float));
+        IVP_L2UN_2XF32_XP(dvecData12, pdvecIn, (in_pitch_width - remXOffset) * sizeof(float));
+
+        dboolGT = IVP_OGTN_2XF32(dvecData11, dvecMax11);
+        dvecMax11 = IVP_MAXN_2XF32(dvecMax11, dvecData11);
+        dvecKyIdx11 = IVP_MOVN_2X32T(ky, dvecKyIdx11, dboolGT);
+
+        dboolGT = IVP_OGTN_2XF32(dvecData12, dvecMax12);
+        dvecMax12 = IVP_MAXN_2XF32(dvecMax12, dvecData12);
+        dvecKyIdx12 = IVP_MOVN_2X32T(ky, dvecKyIdx12, dboolGT);
+      }
+
+      IVP_DSELN_2XF32I(dvecMax12, dvecMax11, dvecMax12, dvecMax11, IVP_DSELI_32B_DEINTERLEAVE_1);
+      IVP_DSELN_2X32I(dvecKyIdx12, dvecKyIdx11, dvecKyIdx12, dvecKyIdx11, IVP_DSELI_32B_DEINTERLEAVE_1);
+
+      // ========== KERNEL WIDTH COMPARISONS ==========
+      for (kx = 0; kx < kernel_width - 1; kx += 2) {
+        // First comparison
+        dboolEq = IVP_OEQN_2XF32(dvecMax11, dvecMax1);
+        dboolGT = IVP_OGTN_2XF32(dvecMax11, dvecMax1);
+        dvecMax1 = IVP_MAXN_2XF32(dvecMax1, dvecMax11);
+
+        dvecGTKyIdx = IVP_MOVN_2X32T(dvecKyIdx11, dvecKyIdx1, dboolGT);
+        dvecEQKyIdx = IVP_MOVN_2X32T(dvecKyIdx11, dvecKyIdx1, dboolEq);
+        dvecKyIdx1 = IVP_MOVN_2X32T(IVP_MINN_2X32(dvecGTKyIdx, dvecEQKyIdx), dvecGTKyIdx, dboolEq);
+
+        dvecGTKxIdx = IVP_MOVN_2X32T(kx, dvecKxIdx1, dboolGT);
+        dvecEQKxIdx = IVP_MOVN_2X32T(kx, dvecKxIdx1, dboolEq);
+        dboolkyIdxLT = IVP_LTN_2X32(dvecKyIdx1, dvecGTKyIdx);
+        dvecKxIdx1 = IVP_MOVN_2X32T(IVP_MOVN_2X32T(dvecEQKxIdx, dvecGTKxIdx, dboolkyIdxLT), dvecGTKxIdx, dboolEq);
+
+        dvecMax11 = IVP_SELN_2XF32I(MIN_FLT32, dvecMax11, IVP_SELI_32B_ROTATE_RIGHT_1);
+        dvecKyIdx11 = IVP_SELN_2X32I(0, dvecKyIdx11, IVP_SELI_32B_ROTATE_RIGHT_1);
+
+        // Second comparison
+        dboolEq = IVP_OEQN_2XF32(dvecMax12, dvecMax1);
+        dboolGT = IVP_OGTN_2XF32(dvecMax12, dvecMax1);
+        dvecMax1 = IVP_MAXN_2XF32(dvecMax1, dvecMax12);
+
+        dvecGTKyIdx = IVP_MOVN_2X32T(dvecKyIdx12, dvecKyIdx1, dboolGT);
+        dvecEQKyIdx = IVP_MOVN_2X32T(dvecKyIdx12, dvecKyIdx1, dboolEq);
+        dvecKyIdx1 = IVP_MOVN_2X32T(IVP_MINN_2X32(dvecGTKyIdx, dvecEQKyIdx), dvecGTKyIdx, dboolEq);
+
+        dvecGTKxIdx = IVP_MOVN_2X32T((kx + 1), dvecKxIdx1, dboolGT);
+        dvecEQKxIdx = IVP_MOVN_2X32T((kx + 1), dvecKxIdx1, dboolEq);
+        dboolkyIdxLT = IVP_LTN_2X32(dvecKyIdx1, dvecGTKyIdx);
+        dvecKxIdx1 = IVP_MOVN_2X32T(IVP_MOVN_2X32T(dvecEQKxIdx, dvecGTKxIdx, dboolkyIdxLT), dvecGTKxIdx, dboolEq);
+
+        dvecMax12 = IVP_SELN_2XF32I(MIN_FLT32, dvecMax12, IVP_SELI_32B_ROTATE_RIGHT_1);
+        dvecKyIdx12 = IVP_SELN_2X32I(0, dvecKyIdx12, IVP_SELI_32B_ROTATE_RIGHT_1);
+      }
+
+      // final comparison if kernel_width is odd (kx keeps its value from the loop above; dvbKernelType gates the candidate)
+      xb_vecN_2xf32 dvecMaxTest = IVP_MOVN_2XF32T(dvecMax1, dvecMax11, dvbKernelType);
+
+      dboolEq = IVP_OEQN_2XF32(dvecMaxTest, dvecMax1);
+      dboolGT = IVP_OGTN_2XF32(dvecMaxTest, dvecMax1);
+      dvecMax1 = IVP_MAXN_2XF32(dvecMax1, dvecMaxTest);
+
+      dvecGTKyIdx = IVP_MOVN_2X32T(IVP_MOVN_2X32T(dvecKyIdx1, dvecKyIdx11, dvbKernelType), dvecKyIdx1, dboolGT);
+      dvecEQKyIdx = IVP_MOVN_2X32T(IVP_MOVN_2X32T(dvecKyIdx1, dvecKyIdx11, dvbKernelType), dvecKyIdx1, dboolEq);
+      dvecKyIdx1 = IVP_MOVN_2X32T(IVP_MINN_2X32(dvecGTKyIdx, dvecEQKyIdx), dvecGTKyIdx, dboolEq);
+
+      dvecGTKxIdx = IVP_MOVN_2X32T(kx, dvecKxIdx1, dboolGT);
+      dvecEQKxIdx = IVP_MOVN_2X32T(kx, dvecKxIdx1, dboolEq);
+      dboolkyIdxLT = IVP_LTN_2X32(dvecKyIdx1, dvecGTKyIdx);
+      dvecKxIdx1 = IVP_MOVN_2X32T(IVP_MOVN_2X32T(dvecEQKxIdx, dvecGTKxIdx, dboolkyIdxLT), dvecGTKxIdx, dboolEq);
+      // Pack both kernel offsets into one word per lane: (dvecKyIdx1 << 4) | dvecKxIdx1, so kx has to stay below 16 to not overlap ky
+      dvecIdx1 = IVP_ORN_2X32(IVP_SLLIN_2X32(dvecKyIdx1, 4), dvecKxIdx1);
+
+      // ========== STORE OUTPUTS ==========
+      // Store max values (only remX elements are requested)
+      pdvecOut = (xb_vecN_2xf32*) pOut;
+      IVP_SAVN_2XF32_XP(dvecMax1, vaOutData, pdvecOut, remX * sizeof(float));
+      IVP_SAPOSN_2XF32_FP(vaOutData, pdvecOut);
+
+      // Store indices (same alignment register, reused after the flush above)
+      pdvecIdx = (xb_vecN_2x32v*) pIdx;
+      IVP_SAVN_2X32_XP(dvecIdx1, vaOutData, pdvecIdx, remX * sizeof(int32_t));
+      IVP_SAPOSN_2X32_FP(vaOutData, pdvecIdx); 
+    }
+  }
+}
+
+void maxpool2d_j2x2_f32(float32_t *restrict ptr_out
+  ,const float32_t *restrict ptr_inp
+  ,int inp_height ,int inp_width
+  ,int out_height ,int out_width
+  ,int32_t in_pitch_width, int32_t in_pitch_height
+  ,int32_t out_pitch_width, int32_t out_pitch_height
+  ,uint8_t kernel_height
+  ,uint8_t kernel_width)
+{
+  const int32_t out_increment = ((IVP_SIMD_WIDTH - kernel_width) / 2) + 1;  // step of the x loop: output columns per strip
+  int32_t x, y, kx, ky;
+  int32_t remX, remXLoad;
+  // NOTE: inp_height, inp_width, in_pitch_height and out_pitch_height are not referenced in this body.
+  xb_vecN_2xf32* restrict pdvecOut;
+  xb_vecN_2xf32* restrict pdvecIn;
+
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2xf32 dvecMax1;
+  xb_vecN_2xf32 dvecMax11, dvecMax12;
+  xb_vecN_2xf32 dvecData11, dvecData12;
+
+  vboolN_2 dvbKernelType = IVP_EQN_2X32((kernel_width % 2), 0);  // compares (kernel_width % 2) with 0: "kernel width is even"
+
+  for (x = 0; x < out_width; x += out_increment) {
+    remX = XT_MIN(out_width - x, out_increment);  // output columns in this strip
+    remXLoad = ((2 * (remX - 1) + kernel_width) > (IVP_SIMD_WIDTH / 2)) ? 1 : 0;  // 1 when the strip needs input beyond IVP_SIMD_WIDTH / 2 floats
+    int32_t remXOffset = remXLoad * (IVP_SIMD_WIDTH / 2);
+
+    for (y = 0; y < out_height; y++) {
+      float* pOut = &ptr_out[y * out_pitch_width + x];
+      const float* pSrc = ptr_inp + y * in_pitch_width * 2 + x * 2;  // stride 2 in both directions is hard-coded
+      pdvecIn = (xb_vecN_2xf32*) pSrc;
+
+      // Initialize max values
+      dvecMax1 = MIN_FLT32;
+      dvecMax11 = dvecMax12 = dvecMax1;
+
+      // ========== KERNEL HEIGHT COMPARISONS ==========
+      for (ky = 0; ky < kernel_height; ky++) {
+        IVP_L2UN_2XF32_XP(dvecData11, pdvecIn, remXOffset * sizeof(float));
+        IVP_L2UN_2XF32_XP(dvecData12, pdvecIn, (in_pitch_width - remXOffset) * sizeof(float));
+
+        dvecMax11 = IVP_MAXN_2XF32(dvecMax11, dvecData11);
+        dvecMax12 = IVP_MAXN_2XF32(dvecMax12, dvecData12);
+      }
+
+      IVP_DSELN_2XF32I(dvecMax12, dvecMax11, dvecMax12, dvecMax11, IVP_DSELI_32B_DEINTERLEAVE_1);
+
+      // ========== KERNEL WIDTH COMPARISONS ==========
+      for (kx = 0; kx < kernel_width - 1; kx += 2) {
+        // First comparison
+        dvecMax1 = IVP_MAXN_2XF32(dvecMax1, dvecMax11);
+        dvecMax11 = IVP_SELN_2XF32I(MIN_FLT32, dvecMax11, IVP_SELI_32B_ROTATE_RIGHT_1);
+
+        // Second comparison
+        dvecMax1 = IVP_MAXN_2XF32(dvecMax1, dvecMax12);
+        dvecMax12 = IVP_SELN_2XF32I(MIN_FLT32, dvecMax12, IVP_SELI_32B_ROTATE_RIGHT_1);
+      }
+
+      // final comparison if kernel_width is odd (dvbKernelType gates the candidate)
+      xb_vecN_2xf32 dvecMaxTest = IVP_MOVN_2XF32T(dvecMax1, dvecMax11, dvbKernelType);
+      dvecMax1 = IVP_MAXN_2XF32(dvecMax1, dvecMaxTest);
+
+      // ========== STORE OUTPUTS ==========
+      // Store max values (only remX elements are requested)
+      pdvecOut = (xb_vecN_2xf32*) pOut;
+      IVP_SAVN_2XF32_XP(dvecMax1, vaOutData, pdvecOut, remX * sizeof(float));
+      IVP_SAPOSN_2XF32_FP(vaOutData, pdvecOut);
+    }
+  }
+}
+#endif /* HAVE_VFPU */
diff --git a/backends/cadence/vision/third-party/library/api/mean.c b/backends/cadence/vision/third-party/library/api/mean.c
new file mode 100644
index 00000000000..01c528333c8
--- /dev/null
+++ b/backends/cadence/vision/third-party/library/api/mean.c
@@ -0,0 +1,110 @@
+/* ------------------------------------------------------------------------ */
+/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.  */
+/* These coded instructions, statements, and computer programs ('Cadence    */
+/* Libraries') are the copyrighted works of Cadence Design Systems Inc.     */
+/* Cadence IP is licensed for use with Cadence processor cores only and     */
+/* must not be used for any other processors and platforms. Your use of the */
+/* Cadence Libraries is subject to the terms of the license agreement you   */
+/* have entered into with Cadence Design Systems, or a sublicense granted   */
+/* to you by a direct Cadence licensee.                                     */
+/* ------------------------------------------------------------------------ */
+
+/*-------------------------------------------------------------------------
+  SIMD Mean Pooling Operations
+  
+  This module implements optimized mean pooling operations using Xtensa
+  Vision DSP SIMD intrinsics for float32 data.
+-------------------------------------------------------------------------*/
+
+#include <xtensa/tie/xt_ivpn.h>
+#include <stdint.h>
+#include "api.h"
+#include "common.h"
+typedef float float32_t;
+
+#ifndef IVP_SIMD_WIDTH
+#define IVP_SIMD_WIDTH XCHAL_IVPN_SIMD_WIDTH
+#endif
+
+
+
+/*-------------------------------------------------------------------------
+  SIMD Mean Pooling 2x2 -> 1x1
+  
+  Description: 
+  This function implements mean pooling across 2x2 spatial dimensions for
+  float32 data using Xtensa SIMD intrinsics.
+  
+  Input shape:  1 x C x 2 x 2 (batch=1, channels=C, height=2, width=2)
+  Output shape: 1 x C x 1 x 1 (batch=1, channels=C, height=1, width=1)
+  
+  Algorithm (per loop iteration):
+  - Load four consecutive input vectors (xb_vecN_2xf32 each)
+  - Deinterleave them twice; this is intended to place the four samples of
+    every channel in the same lane of four separate vectors
+  - Add the four vectors, multiply by 0.25 and store one output vector
+  
+  With SIMD width 32, xb_vecN_2xf32 holds 16 float32 values, so one
+  iteration consumes 64 floats laid out as ch0[0,0], ch0[0,1], ch0[1,0],
+  ch0[1,1], ch1[0,0], ... (16 channels x 4 samples) and produces 16 means.
+  
+  Parameters:
+  Input:
+    input    Input tensor in CHW format (channels, 2x2 spatial)
+    N        Element count; the loop runs N >> (LOG2_IVP_SIMD_WIDTH + 1)
+             iterations. NOTE(review): with SIMD width 32 that matches N being
+             the number of input floats (4 * channels) - confirm with callers.
+  Output:
+    output   Output tensor (channels, 1x1 spatial), one float per channel
+  
+  Restrictions:
+    - only whole iterations are processed; there is no tail handling, so
+      leftover elements are neither read nor written
+    - N <= 0 returns immediately
+    - input and output must be aligned to 64-byte boundary
+    - input and output must not overlap
+-------------------------------------------------------------------------*/
+void simd_mean_pool_2x2_to_1x1_float32(float32_t* restrict output, 
+                                       const float32_t* restrict input,
+                                       int N) 
+{
+    int n;
+    xb_vecN_2xf32 vec0, vec1, vec2, vec3;
+    xb_vecN_2xf32 vec0_0, vec0_1, vec1_0, vec1_1;
+    xb_vecN_2xf32 v0, v1, v2, v3, sum_all, result;
+    const xb_vecN_2xf32* restrict pInput = (const xb_vecN_2xf32*)input;
+    xb_vecN_2xf32* restrict pOutput = (xb_vecN_2xf32*)output;
+    
+    if (N <= 0) return;
+   
+    __Pragma("no_reorder");
+    // __Pragma("loop_count min=1");
+    
+    for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH + 1)); n++) {
+        // Load 64 float32 values (4 vectors) - 16 channels × 4 values each
+        IVP_LVN_2XF32_IP(vec0, pInput, 2 * IVP_SIMD_WIDTH);  // 0-15
+        IVP_LVN_2XF32_IP(vec1, pInput, 2 * IVP_SIMD_WIDTH);  // 16-31
+        IVP_LVN_2XF32_IP(vec2, pInput, 2 * IVP_SIMD_WIDTH);  // 32-47
+        IVP_LVN_2XF32_IP(vec3, pInput, 2 * IVP_SIMD_WIDTH);  // 48-63
+        
+        // First level: Deinterleave vec0-vec1 and vec2-vec3 pairs (independent)
+        IVP_DSELN_2XF32I(vec0_0, vec0_1, vec1, vec0, IVP_DSELI_DEINTERLEAVE_2);
+        IVP_DSELN_2XF32I(vec1_0, vec1_1, vec3, vec2, IVP_DSELI_DEINTERLEAVE_2);
+        
+        // Second level: Cross-deinterleave directly to final vectors
+        IVP_DSELN_2XF32I(v2, v0, vec1_0, vec0_0, IVP_DSELI_DEINTERLEAVE_2);
+        IVP_DSELN_2XF32I(v3, v1, vec1_1, vec0_1, IVP_DSELI_DEINTERLEAVE_2);
+        
+        // v0..v3 are expected to each hold one of the four 2x2 samples (input index mod 4) per channel lane;
+        // the exact v0..v3-to-sample mapping is not verified here and does not affect the sum below
+        
+        // Tree-shaped add: ((v0 + v1) + (v2 + v3)) for better pipelining
+        sum_all = IVP_ADDN_2XF32(IVP_ADDN_2XF32(v0, v1), IVP_ADDN_2XF32(v2, v3));
+        
+        // Multiply by 0.25 to get mean
+        result = IVP_MULN_2XF32(sum_all, 0.25f);
+        
+        // Store result
+        IVP_SVN_2XF32_IP(result, pOutput, 2 * IVP_SIMD_WIDTH);
+    }
+}
diff --git a/backends/cadence/vision/third-party/library/api/quanitze_relu.c b/backends/cadence/vision/third-party/library/api/quanitze_relu.c
new file mode 100644
index 00000000000..97b0a19a654
--- /dev/null
+++ b/backends/cadence/vision/third-party/library/api/quanitze_relu.c
@@ -0,0 +1,112 @@
+
+#include "api.h"
+#include "common.h"
+#include <math.h>
+
+void vrelU_quantized(
+    int8_t* restrict ptr_out,        // [out] N int8 results
+    const int8_t* restrict ptr_inp,  // [in]  N int8 inputs
+    int32_t in_zero_point,           // subtracted from each input before the ReLU
+    int32_t out_zero_point,          // added to the scaled, rectified value
+    float32_t out_scale,             // multiplier applied to the rectified value
+    int N)                           // number of elements to process
+{
+  // out = round(clamp(out_zero_point + max(in - in_zero_point, 0) * out_scale, -128, 127))
+  xb_vecNx8 *p_i = (xb_vecNx8 *)ptr_inp;
+  xb_vecNx8 *p_o = (xb_vecNx8 *)ptr_out;
+  if (N <= 0) return;  // nothing to do; a negative N would otherwise enter the tail path below
+  // Loop index
+  int n;
+
+  // Alignment state: primed aligning load for the input, empty aligning store for the output
+  valign al_i = IVP_LANX8S_PP(p_i);
+  valign al_o = IVP_ZALIGN();
+
+  // Constants
+  xb_vecN_2x32v zero_vec = 0;
+  xb_vecN_2x32v in_zp_vec = (xb_vecN_2x32v)in_zero_point;
+  xb_vecN_2xf32 out_zp_f32 = (xb_vecN_2xf32)(float32_t)out_zero_point;
+  xb_vecN_2xf32 min_val = (xb_vecN_2xf32)(-128.0f);
+  xb_vecN_2xf32 max_val = (xb_vecN_2xf32)(127.0f);
+
+  for (n = 0; n < (N >> LOG2_IVP_SIMD_WIDTH); n++)
+  {
+    xb_vecNx16 inp;
+    xb_vecN_2x32v temp1, temp2;
+    xb_vecN_2xf32 float1, float2;
+    xb_vecN_2xf32 result1, result2;
+    xb_vecNx16 out;
+
+    // Load int8 → sign-extend to 16-bit
+    IVP_LANX8S_XP(inp, al_i, p_i, IVP_SIMD_WIDTH);
+
+    // Unpack 16-bit → two 32-bit vectors (low half and high half of the lanes)
+    temp1 = IVP_UNPKSNX16_L(inp);
+    temp2 = IVP_UNPKSNX16_H(inp);
+
+    // Integer operations: SUB in_zero_point
+    temp1 = IVP_SUBN_2X32(temp1, in_zp_vec);
+    temp2 = IVP_SUBN_2X32(temp2, in_zp_vec);
+
+    // ReLU: MAX(temp, 0)
+    temp1 = IVP_MAXN_2X32(temp1, zero_vec);
+    temp2 = IVP_MAXN_2X32(temp2, zero_vec);
+
+    // Convert int32 → float32 (implicit cast)
+    float1 = (xb_vecN_2xf32)temp1;
+    float2 = (xb_vecN_2xf32)temp2;
+
+    // FMA: out_zero_point + temp * out_scale
+    result1 = out_zp_f32;
+    IVP_MULAN_2XF32(result1, float1, out_scale);
+    result2 = out_zp_f32;
+    IVP_MULAN_2XF32(result2, float2, out_scale);
+
+    // Clamp to [-128, 127] and round to nearest integer
+    result1 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(result1, max_val), min_val));
+    result2 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(result2, max_val), min_val));
+
+    // Pack float → int16 → int8 (no explicit conversion needed)
+    out = IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(result2, result1, IVP_SELI_EXTRACT_1_OF_2_OFF_0));
+    IVP_SANX8S_IP(out, al_o, p_o);
+  }
+
+  // Handle remaining elements (tail): same arithmetic, but only N % IVP_SIMD_WIDTH bytes are stored
+  if (N & (IVP_SIMD_WIDTH - 1))
+  {
+    xb_vecNx16 inp;
+    xb_vecN_2x32v temp1, temp2;
+    xb_vecN_2xf32 float1, float2;
+    xb_vecN_2xf32 result1, result2;
+    xb_vecNx16 out;
+    // NOTE(review): unlike the IVP_LAV* tail loads used in sibling kernels, this load presumably reads a full vector - confirm
+    IVP_LANX8S_XP(inp, al_i, p_i, N & (IVP_SIMD_WIDTH - 1));
+
+    temp1 = IVP_UNPKSNX16_L(inp);
+    temp2 = IVP_UNPKSNX16_H(inp);
+
+    temp1 = IVP_SUBN_2X32(temp1, in_zp_vec);
+    temp2 = IVP_SUBN_2X32(temp2, in_zp_vec);
+
+    temp1 = IVP_MAXN_2X32(temp1, zero_vec);
+    temp2 = IVP_MAXN_2X32(temp2, zero_vec);
+
+    float1 = (xb_vecN_2xf32)temp1;
+    float2 = (xb_vecN_2xf32)temp2;
+
+    result1 = out_zp_f32;
+    IVP_MULAN_2XF32(result1, float1, out_scale);
+    result2 = out_zp_f32;
+    IVP_MULAN_2XF32(result2, float2, out_scale);
+
+    // Clamp to [-128, 127] and round to nearest integer
+    result1 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(result1, max_val), min_val));
+    result2 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(result2, max_val), min_val));
+
+    // Pack float → int16 → int8 (no explicit conversion needed)
+    out = IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(result2, result1, IVP_SELI_EXTRACT_1_OF_2_OFF_0));
+    IVP_SAVNX8S_XP(out, al_o, p_o, (N & (IVP_SIMD_WIDTH - 1)));
+  }
+
+  IVP_SAPOSNX8S_FP(al_o, p_o);  // flush whatever is still held in the store-alignment register
+}
diff --git a/backends/cadence/vision/third-party/library/api/quantizef.c b/backends/cadence/vision/third-party/library/api/quantizef.c
new file mode 100644
index 00000000000..7803a812f84
--- /dev/null
+++ b/backends/cadence/vision/third-party/library/api/quantizef.c
@@ -0,0 +1,79 @@
+/* ------------------------------------------------------------------------ */
+/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.  */
+/* These coded instructions, statements, and computer programs ('Cadence    */
+/* Libraries') are the copyrighted works of Cadence Design Systems Inc.     */
+/* Cadence IP is licensed for use with Cadence processor cores only and     */
+/* must not be used for any other processors and platforms. Your use of the */
+/* Cadence Libraries is subject to the terms of the license agreement you   */
+/* have entered into with Cadence Design Systems, or a sublicense granted   */
+/* to you by a direct Cadence licensee.                                     */
+/* ------------------------------------------------------------------------ */
+
+#include "api.h"
+#include "common.h"
+
+#if !HAVE_VFPU
+DISCARD_FUN(void, quantize_f32_asym8s, (int8_t *restrict ptr_out
+  ,const float32_t *restrict ptr_inp
+  ,float32_t scale
+  ,int zero_bias
+  ,int N))
+#else
+void quantize_f32_asym8s(int8_t *restrict ptr_out
+  ,const float32_t *restrict ptr_inp
+  ,float32_t scale
+  ,int zero_bias
+  ,int N)
+{
+  // out = round(clamp(zero_bias + in * (1 / scale), MIN_INT8, MAX_INT8)); 1 / scale is capped at MAX_FLT32
+  xb_vecN_2xf32 *p_i = (xb_vecN_2xf32 *)ptr_inp;
+  xb_vecNx8 *p_o = (xb_vecNx8 *)ptr_out;
+  float32_t one_by_scaleF = (float32_t) (1.0f / scale);
+  float32_t one_by_scale = (one_by_scaleF > (float32_t) MAX_FLT32 ? (float32_t) MAX_FLT32 : one_by_scaleF);
+  if (N <= 0) return;  // nothing to do; a negative N would otherwise enter the tail path below
+  // Loop index
+  int n;
+
+  // Alignment state: primed aligning load for the input, empty aligning store for the output
+  valign al_i = IVP_LAN_2XF32_PP(p_i);
+  valign al_o = IVP_ZALIGN();
+  // Main loop: two float vectors in, IVP_SIMD_WIDTH int8 values out per iteration
+  for (n = 0; n < (N >> LOG2_IVP_SIMD_WIDTH); n++)
+  {
+    xb_vecN_2xf32 inp1, inp2;
+    xb_vecN_2xf32 inp1_scaled, inp2_scaled;
+    xb_vecN_2xf32 out1, out2;
+    xb_vecNx16 out;
+
+    IVP_LAN_2XF32_IP(inp1, al_i, p_i);
+    IVP_LAN_2XF32_IP(inp2, al_i, p_i);
+    inp1_scaled = (float32_t) zero_bias;  // accumulator starts at the zero point
+    IVP_MULAN_2XF32(inp1_scaled, inp1, one_by_scale);
+    inp2_scaled = (float32_t) zero_bias;
+    IVP_MULAN_2XF32(inp2_scaled, inp2, one_by_scale);
+    out1 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(inp1_scaled, (xb_vecN_2xf32) MAX_INT8), (xb_vecN_2xf32) MIN_INT8));
+    out2 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(inp2_scaled, (xb_vecN_2xf32) MAX_INT8), (xb_vecN_2xf32) MIN_INT8));
+    out = IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(out2, out1, IVP_SELI_EXTRACT_1_OF_2_OFF_0));
+    IVP_SANX8S_IP(out, al_o, p_o);
+  }
+  if (N & (IVP_SIMD_WIDTH - 1))    // Check if there are remaining elements
+  {
+    xb_vecN_2xf32 inp1, inp2;
+    xb_vecN_2xf32 inp1_scaled, inp2_scaled;
+    xb_vecN_2xf32 out1, out2;
+    xb_vecNx16 out;
+    // Tail spans two half-width loads; the second byte count is <= 0 for short tails (presumably clamped by the load - confirm)
+    IVP_LAVN_2XF32_XP(inp1, al_i, p_i, 4 * (N & (IVP_SIMD_WIDTH - 1)));
+    IVP_LAVN_2XF32_XP(inp2, al_i, p_i, 4 * ((N & (IVP_SIMD_WIDTH - 1)) - (IVP_SIMD_WIDTH >> 1)));
+    inp1_scaled = (float32_t) zero_bias;
+    IVP_MULAN_2XF32(inp1_scaled, inp1, one_by_scale);
+    inp2_scaled = (float32_t) zero_bias;
+    IVP_MULAN_2XF32(inp2_scaled, inp2, one_by_scale);
+    out1 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(inp1_scaled, (xb_vecN_2xf32) MAX_INT8), (xb_vecN_2xf32) MIN_INT8));
+    out2 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(inp2_scaled, (xb_vecN_2xf32) MAX_INT8), (xb_vecN_2xf32) MIN_INT8));
+    out = IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(out2, out1, IVP_SELI_EXTRACT_1_OF_2_OFF_0));
+    IVP_SAVNX8S_XP(out, al_o, p_o, (N & (IVP_SIMD_WIDTH - 1)));  // store only the tail bytes
+  }
+  IVP_SAPOSNX8S_FP(al_o, p_o);  // flush whatever is still held in the store-alignment register
+}
+#endif
\ No newline at end of file
diff --git a/backends/cadence/vision/third-party/library/api/vaddf.c b/backends/cadence/vision/third-party/library/api/vaddf.c
new file mode 100644
index 00000000000..2e64703a194
--- /dev/null
+++ b/backends/cadence/vision/third-party/library/api/vaddf.c
@@ -0,0 +1,124 @@
+/* ------------------------------------------------------------------------ */
+/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.  */
+/* These coded instructions, statements, and computer programs ('Cadence    */
+/* Libraries') are the copyrighted works of Cadence Design Systems Inc.     */
+/* Cadence IP is licensed for use with Cadence processor cores only and     */
+/* must not be used for any other processors and platforms. Your use of the */
+/* Cadence Libraries is subject to the terms of the license agreement you   */
+/* have entered into with Cadence Design Systems, or a sublicense granted   */
+/* to you by a direct Cadence licensee.                                     */
+/* ------------------------------------------------------------------------ */
+/*  IntegrIT, Ltd.   www.integrIT.com, info@integrIT.com                    */
+/*                                                                          */
+/* NatureDSP_Baseband Library                                               */
+/*                                                                          */
+/* This library contains copyrighted materials, trade secrets and other     */
+/* proprietary information of IntegrIT, Ltd. This software is licensed for  */
+/* use with Cadence processor cores only and must not be used for any other */
+/* processors and platforms. The license to use these sources was given to  */
+/* Cadence, Inc. under Terms and Condition of a Software License Agreement  */
+/* between Cadence, Inc. and IntegrIT, Ltd.                                 */
+/* ------------------------------------------------------------------------ */
+/*          Copyright (C) 2009-2022 IntegrIT, Limited.                      */
+/*                      All Rights Reserved.                                */
+/* ------------------------------------------------------------------------ */
+/*
+  NatureDSP_Baseband library. Vector Operations
+    Real Vectors Sum
+*/
+
+/* Cross-platform data type definitions. */
+/* Common helper macros. */
+#include "api.h"
+#include "common.h"
+#define IVP_SIMD_WIDTH XCHAL_IVPN_SIMD_WIDTH
+/* Vector Operations. */
+
+
+/*-------------------------------------------------------------------------
+Real Vectors Sum
+
+Description: These routines perform pairwise summation of real vectors.
+
+Representation:
+rvadd        Signed fixed-point format. 16-bit inputs, 16-bit saturated results
+rvadd_32b    Signed fixed-point format. 32-bit inputs, 32-bit saturated results
+rvadd_fp16   IEEE-754 Std. half precision floating-point format for
+             input/output data
+rvaddf       IEEE-754 Std. single precision floating-point format for
+             input/output data
+rvadd_f64    IEEE-754 Std. double precision floating-point format for
+             input/output data
+
+Parameters:
+Input:
+x[N]   Input vector
+y[N]   Input vector
+N      Length of vectors
+Output:
+z[N]   Sum of input vectors
+
+Restrictions:
+z,x,y  Must not overlap
+z,x,y  Aligned on 2*BBE_SIMD_WIDTH-byte boundary
+N      Multiple of BBE_SIMD_WIDTH (rvadd,rvadd_fp16)
+       Multiple of BBE_SIMD_WIDTH/2 (rvadd_32b, rvaddf)
+       Multiple of BBE_SIMD_WIDTH/4 (rvadd_f64)
+-------------------------------------------------------------------------*/
+void rvaddf(float32_t *restrict z, const float32_t *restrict x,
+            const float32_t *restrict y, int N) {
+#if (1)
+  int n;
+  xb_vecN_2xf32 x0, y0, z0;
+  /* z[i] = x[i] + y[i] for i in [0, N): whole vectors first, then a variable-length tail. */
+  const xb_vecN_2xf32 *restrict pX = (const xb_vecN_2xf32 *)x;
+  const xb_vecN_2xf32 *restrict pY = (const xb_vecN_2xf32 *)y;
+  xb_vecN_2xf32 *restrict pZ = (xb_vecN_2xf32 *)z;
+  NASSERT_ALIGN(x, (2 * IVP_SIMD_WIDTH));
+  NASSERT_ALIGN(y, (2 * IVP_SIMD_WIDTH));
+  NASSERT_ALIGN(z, (2 * IVP_SIMD_WIDTH));
+  NASSERT(N % (IVP_SIMD_WIDTH / 2) == 0);
+  if (N <= 0)
+    return;
+  __Pragma("no_reorder");
+
+  /* Whole vectors: IVP_SIMD_WIDTH/2 floats (2*IVP_SIMD_WIDTH bytes) per iteration. */
+  for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH-1)); n++) {
+    IVP_LVN_2XF32_IP(x0, pX, 2 * IVP_SIMD_WIDTH);
+    IVP_LVN_2XF32_IP(y0, pY, 2 * IVP_SIMD_WIDTH);
+    z0 = IVP_ADDN_2XF32(x0, y0);
+    IVP_SVN_2XF32_IP(z0, pZ, 2 * IVP_SIMD_WIDTH);
+  }
+
+  if (N & ((IVP_SIMD_WIDTH>>1) - 1)) {
+    valign vx0 = IVP_LAN_2XF32_PP(pX);
+    valign vy0 = IVP_LAN_2XF32_PP(pY);
+    valign vz0 = IVP_ZALIGN();
+    const int tail_bytes = (int)sizeof(float32_t) * (N & ((IVP_SIMD_WIDTH >> 1) - 1)); /* was a constant unrelated to N */
+    IVP_LAVN_2XF32_XP(x0, vx0, pX, tail_bytes);
+    IVP_LAVN_2XF32_XP(y0, vy0, pY, tail_bytes);
+    z0 = IVP_ADDN_2XF32(x0, y0);
+    IVP_SAVN_2XF32_XP(z0, vz0, pZ, tail_bytes); /* store exactly the leftover floats */
+    IVP_SAPOSN_2XF32_FP(vz0, pZ);
+  }
+#else
+  int n;
+  xtfloat x0, y0, z0;
+  const xtfloat *restrict pX = (const xtfloat *)x;
+  const xtfloat *restrict pY = (const xtfloat *)y;
+  xtfloat *restrict pZ = (xtfloat *)z;
+  NASSERT_ALIGN(x, (2 * IVP_SIMD_WIDTH));
+  NASSERT_ALIGN(y, (2 * IVP_SIMD_WIDTH));
+  NASSERT_ALIGN(z, (2 * IVP_SIMD_WIDTH));
+  NASSERT(N % (IVP_SIMD_WIDTH / 2) == 0);
+  if (N <= 0)
+    return;
+
+  for (n = 0; n < (N); n++) {
+    XT_LSIP(x0, pX, sizeof(xtfloat));
+    XT_LSIP(y0, pY, sizeof(xtfloat));
+    z0 = XT_ADD_S(x0, y0);
+    XT_SSIP(z0, pZ, sizeof(xtfloat));
+  }
+#endif
+}
diff --git a/backends/cadence/vision/third-party/library/api/vdot_zeropt.c b/backends/cadence/vision/third-party/library/api/vdot_zeropt.c
new file mode 100644
index 00000000000..9c0b25956d5
--- /dev/null
+++ b/backends/cadence/vision/third-party/library/api/vdot_zeropt.c
@@ -0,0 +1,123 @@
+/* ------------------------------------------------------------------------ */
+/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.  */
+/* These coded instructions, statements, and computer programs ('Cadence    */
+/* Libraries') are the copyrighted works of Cadence Design Systems Inc.     */
+/* Cadence IP is licensed for use with Cadence processor cores only and     */
+/* must not be used for any other processors and platforms. Your use of the */
+/* Cadence Libraries is subject to the terms of the license agreement you   */
+/* have entered into with Cadence Design Systems, or a sublicense granted   */
+/* to you by a direct Cadence licensee.                                     */
+/* ------------------------------------------------------------------------ */
+
+#include <stdio.h>
+#include "api.h"
+#include "common.h"
+
+// Sum every lane of an Nx48 accumulator into one 32-bit value: both halves are converted to 32-bit, reduced, then added (statement expression)
+#define IVP_RADDNX32W_EMULATED(vecNx48) ({ \
+  xb_vecN_2x32v q0 = IVP_CVT32SNX48H(vecNx48); \
+  xb_vecN_2x32v q1 = IVP_CVT32SNX48L(vecNx48); \
+  xb_int32v s0 = IVP_RADDN_2X32(q0); \
+  xb_int32v s1 = IVP_RADDN_2X32(q1); \
+  s0 + s1; \
+})
+
+/*-------------------------------------------------------------------------
+  Vector Dot Product with Zero-Point Subtraction
+
+  Description: This routine performs dot product of two quantized int8 vectors
+  with zero-point subtraction applied before multiplication:
+    result = init_acc + sum((x[i] - x_zp) * (y[i] - y_zp)) for i=0..N-1
+
+  This is commonly used in quantized neural network operations where
+  zero-point offset needs to be removed before computation.
+
+  Representation:
+  rvdot_zeropt   Signed fixed-point format. 8-bit inputs, 32-bit result
+
+  Parameters:
+  Input:
+  init_acc  Initial accumulator value (int32)
+  x[N]      Input vector (int8)
+  y[N]      Input vector (int8)
+  x_zp      Zero-point for x vector (int8)
+  y_zp      Zero-point for y vector (int8)
+  N         Length of vectors
+
+  Output:
+            Returns 32-bit accumulated dot product result
+
+  Restrictions:
+  x,y       Aligned on 2*BBE_SIMD_WIDTH-byte boundary preferred
+  N         Any positive value (tail handling included)
+-------------------------------------------------------------------------*/
+int32_t rvdot_zeropt(
+    int32_t init_acc,
+    const int8_t *restrict x,
+    const int8_t *restrict y,
+    int8_t x_zp,
+    int8_t y_zp,
+    int N) {
+  // Returns init_acc + sum over i in [0, N) of (x[i] - x_zp) * (y[i] - y_zp); init_acc alone when N <= 0.
+  const xb_vecNx8 *restrict pX = (const xb_vecNx8 *)x;
+  const xb_vecNx8 *restrict pY = (const xb_vecNx8 *)y;
+  
+  xb_vecNx48 acc = 0;  // Initialize accumulator to zero
+  xb_vecNx16 vx, vy;
+  xb_vecNx16 vx_shifted, vy_shifted;
+  
+  int k;
+  
+  if (N <= 0)
+    return init_acc;
+  // NOTE(review): the main loop uses IVP_LVNX8S_IP (no valign state), so x and y presumably must be vector-aligned - confirm
+  // Process in chunks of IVP_SIMD_WIDTH (typically 32 elements) using Nx16
+  for (k = 0; k < (N >> LOG2_IVP_SIMD_WIDTH); k++) {
+    // Load vectors as Nx8 with sign-extension to Nx16 (loads N int8 elements)
+    IVP_LVNX8S_IP(vx, pX, IVP_SIMD_WIDTH);
+    IVP_LVNX8S_IP(vy, pY, IVP_SIMD_WIDTH);
+    
+    // Subtract zero-points in 16-bit: (x - x_zp), (y - y_zp)
+    vx_shifted = IVP_SUBNX16(vx, (int16_t)x_zp);
+    vy_shifted = IVP_SUBNX16(vy, (int16_t)y_zp);
+    
+    // Multiply-accumulate: acc += (x - x_zp) * (y - y_zp)
+    IVP_MULANX16(acc, vx_shifted, vy_shifted);
+  }
+  
+  // Handle remaining elements with SIMD (k now equals the number of full vectors processed)
+  int processed = k << LOG2_IVP_SIMD_WIDTH;
+  int remaining = N - processed;
+  
+  if (remaining > 0) {
+    valign vaX = IVP_LANX8S_PP((const xb_vecNx8 *)pX);
+    valign vaY = IVP_LANX8S_PP((const xb_vecNx8 *)pY);
+    
+    // Load remaining elements with variable-length aligning loads (byte count = remaining)
+    IVP_LAVNX8S_XP(vx, vaX, (const xb_vecNx8 *)pX, remaining);
+    IVP_LAVNX8S_XP(vy, vaY, (const xb_vecNx8 *)pY, remaining);
+    
+    // Subtract zero-points in 16-bit
+    vx_shifted = IVP_SUBNX16(vx, (int16_t)x_zp);
+    vy_shifted = IVP_SUBNX16(vy, (int16_t)y_zp);
+    
+    // Create mask for valid elements (true for indices < remaining)
+    vboolN mask = IVP_LTNX16(IVP_SEQNX16(), remaining);
+    
+    // Zero out invalid positions: after the subtraction the unused lanes are not zero, so they must be masked
+    vx_shifted = IVP_MOVNX16T(vx_shifted, IVP_ZERONX16(), mask);
+    vy_shifted = IVP_MOVNX16T(vy_shifted, IVP_ZERONX16(), mask);
+    
+    // Multiply-accumulate for tail (accumulate into same acc)
+    // Invalid positions are 0*0 = 0, so they don't contribute
+    IVP_MULANX16(acc, vx_shifted, vy_shifted);
+  }
+  
+  // Reduce accumulator to single int32 (after all elements processed)
+  int32_t result = IVP_RADDNX32W_EMULATED(acc);
+  
+  // Add initial accumulator value
+  result += init_acc;
+  
+  return result;
+}
diff --git a/backends/cadence/vision/third-party/library/api/vsoftmaxf.c b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c
index 27487c75d6c..7e85a8b9c73 100644
--- a/backends/cadence/vision/third-party/library/api/vsoftmaxf.c
+++ b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c
@@ -63,33 +63,23 @@ y[N]   result, Q7.8 or floating point
 x,y    Must not overlap
 -------------------------------------------------------------------------*/
 
-#define IVP_ADDSN_2X32(b_, c_)         \
-  ({                                   \
-    xb_vecN_2x32v a_;                  \
-    xb_vecN_2x64w tmp_a_;              \
-    tmp_a_ = IVP_MULN_2X32(b_, 1);     \
-    IVP_MULAN_2X32(tmp_a_, c_, 1);     \
-    a_ = IVP_PACKVRN_2X64W(tmp_a_, 0); \
-    a_;                                \
-  })
-
 #if !HAVE_VFPU
-DISCARD_FUN(void, vsoftmaxf, (float32_t * y, const float32_t* x, int N))
+DISCARD_FUN(void, vsoftmaxf, (float32_t * y, const float32_t *x, int N))
 #else
-void vsoftmaxf(float32_t* y, const float32_t* x, int N) {
+void vsoftmaxf(float32_t *y, const float32_t *x, int N) {
 #if !defined(IVP_MULN_2X32)
 #else
-  const int* pTbl = (const int*)expftbl_Q30;
+  const int *pTbl = (const int *)expftbl_Q30;
 #endif
-  const xb_vecN_2xf32* restrict pX;
-  xb_vecN_2xf32* restrict pY;
+  const xb_vecN_2xf32 *restrict pX;
+  xb_vecN_2xf32 *restrict pY;
   xb_vecN_2xf32 norm, ysum, xmax;
   int n;
   valign al_X, al_R, al_Y;
   if (N < 0)
     return;
   xmax = minusInff.f;
-  pX = (const xb_vecN_2xf32*)x;
+  pX = (const xb_vecN_2xf32 *)x;
   al_X = IVP_LAN_2XF32_PP(pX);
   al_Y = IVP_ZALIGN();
   for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH - 1)); n++) {
@@ -99,17 +89,17 @@ void vsoftmaxf(float32_t* y, const float32_t* x, int N) {
   }
   if (N & (IVP_SIMD_WIDTH / 2 - 1)) {
     xb_vecN_2xf32 x;
-    IVP_LAVN_2XF32_XP(
-        x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
-    IVP_MAXNUMN_2XF32T(
-        xmax, xmax, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1))));
+    IVP_LAVN_2XF32_XP(x, al_X, pX,
+                      sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+    IVP_MAXNUMN_2XF32T(xmax, xmax, x,
+                       IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1))));
   }
 
   xmax = IVP_REPN_2XF32(IVP_RMAXNUMN_2XF32(xmax), 0);
   __Pragma("no_reorder");
   ysum = 0.f;
-  pX = (const xb_vecN_2xf32*)x;
-  pY = (xb_vecN_2xf32*)y;
+  pX = (const xb_vecN_2xf32 *)x;
+  pY = (xb_vecN_2xf32 *)y;
   al_X = IVP_LAN_2XF32_PP(pX);
   {
     vboolN_2 bnan;
@@ -163,8 +153,8 @@ void vsoftmaxf(float32_t* y, const float32_t* x, int N) {
     }
     if (N & (IVP_SIMD_WIDTH / 2 - 1)) {
       xb_vecN_2xf32 x;
-      IVP_LAVN_2XF32_XP(
-          x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+      IVP_LAVN_2XF32_XP(x, al_X, pX,
+                        sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
       x = IVP_SUBN_2XF32(x, xmax);
       bnan |= IVP_UNN_2XF32(x, x);
       {
@@ -206,18 +196,18 @@ void vsoftmaxf(float32_t* y, const float32_t* x, int N) {
         zout = IVP_MULN_2XF32(gf, IVP_MOVN_2XF32_FROMN_2X32(exp));
         x = zout;
       }
-      IVP_ADDN_2XF32T(
-          ysum, ysum, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1))));
-      IVP_SAVN_2XF32_XP(
-          x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+      IVP_ADDN_2XF32T(ysum, ysum, x,
+                      IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1))));
+      IVP_SAVN_2XF32_XP(x, al_Y, pY,
+                        sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
     }
     IVP_SAPOSN_2XF32_FP(al_Y, pY);
     ysum = IVP_MOVN_2XF32T(qNaNf.f, ysum, bnan);
   }
   norm = XT_RECIP_S(IVP_RADDN_2XF32(ysum));
   __Pragma("no_reorder");
-  pX = (const xb_vecN_2xf32*)y;
-  pY = (xb_vecN_2xf32*)y;
+  pX = (const xb_vecN_2xf32 *)y;
+  pY = (xb_vecN_2xf32 *)y;
 
   al_R = IVP_LAN_2XF32_PP(pX);
 
@@ -229,11 +219,11 @@ void vsoftmaxf(float32_t* y, const float32_t* x, int N) {
   }
   if (N & (IVP_SIMD_WIDTH / 2 - 1)) {
     xb_vecN_2xf32 x;
-    IVP_LAVN_2XF32_XP(
-        x, al_R, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+    IVP_LAVN_2XF32_XP(x, al_R, pX,
+                      sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
     x = IVP_MULN_2XF32(x, norm);
-    IVP_SAVN_2XF32_XP(
-        x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+    IVP_SAVN_2XF32_XP(x, al_Y, pY,
+                      sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
   }
   IVP_SAPOSN_2XF32_FP(al_Y, pY);
 
diff --git a/backends/cadence/vision/third-party/library/dma.c b/backends/cadence/vision/third-party/library/dma.c
new file mode 100644
index 00000000000..199c9a5debf
--- /dev/null
+++ b/backends/cadence/vision/third-party/library/dma.c
@@ -0,0 +1,62 @@
+/*
+ * dma.c
+ *
+ *  Created on: Oct 30, 2025
+ *      Author: sraut
+ */
+
+#include "lib.h"
+
+// We assume that the DSP uses multichannel IDMA with 2 channels available for 2D transfers (e.g., ping-pong buffers)
+// and 1 channel for 3D transfers.
+
+IDMA_BUFFER_DEFINE(buffer_idma_ch0, 2 * CHL_MAX, IDMA_2D_DESC);   // 2D descriptors; reached through descbuf[0]
+IDMA_BUFFER_DEFINE(buffer_idma_ch1, 2 * CHL_MAX, IDMA_2D_DESC);   // 2D descriptors; reached through descbuf[1]
+IDMA_BUFFER_DEFINE(buffer_idma_ch3, 2 * CHL_MAX, IDMA_64B_DESC);  // 64-byte descriptors; used by dma_3dm_init()
+// 2D descriptor buffers, indexed by channel number in dma_2dm_init(); only two entries exist.
+idma_buffer_t *  descbuf[] = {
+    buffer_idma_ch0,
+    buffer_idma_ch1,
+};
+
+// Pointers to DRAM buffers used by softmax (they alias the start of dram0_pool / dram1_pool)
+void *ptr_dram0 = (void *)dram0_pool;
+void *ptr_dram1 = (void *)dram1_pool;
+
+void err_cb_func(const idma_error_details_t *error) {  // passed as the last argument of the idma_init() calls in this file
+  (void) error;  // the details are discarded: nothing is logged or recovered here
+}
+
+void dma_3dm_init(int ch) {  // initialise channel `ch` and attach the 64-byte-descriptor buffer to it
+  idma_init(ch, 0, MAX_BLOCK_16, 16, TICK_CYCLES_8, 100000, err_cb_func);  // any return value is ignored
+  idma_init_loop(ch, buffer_idma_ch3, IDMA_64B_DESC, CHL_MAX, NULL, NULL);  // any return value is ignored
+}
+
+void dma_2dm_init(int ch) {  // initialise channel `ch` and attach its 2D descriptor buffer, descbuf[ch]
+  idma_init(ch, 0, MAX_BLOCK_16, 8, TICK_CYCLES_1, 0, err_cb_func);  // any return value is ignored
+  idma_init_loop(ch, descbuf[ch], IDMA_2D_DESC, CHL_MAX, NULL, NULL);  // NOTE(review): descbuf has two entries and ch is not range-checked
+}
+
+void dma_3dm(int ch, void *src, void *dst, int src_row_pitch, int dst_row_pitch,
+            int src_tile_pitch, int dst_tile_pitch, int row_sz,
+            int nrows, int ntiles) {  // forwards one 3D transfer on channel `ch`; src precedes dst here, the callee takes dst first
+  (void) idma_copy_3d_desc64(ch, &dst, &src, DESC_IDMA_PRIOR_L /*Default*/, row_sz,  // the call's result is discarded
+                          nrows, ntiles, src_row_pitch, dst_row_pitch,
+                          src_tile_pitch, dst_tile_pitch);  // NOTE(review): passes the addresses of the local dst/src pointers - confirm the desc64 API expects that
+}
+
+
+void dma_2dm(int ch,void *_psrc,void *_pdst, int src_stride, int dst_stride,
+            int num_bytes, short num_lines) {  // forwards one 2D transfer: num_lines rows of num_bytes each, with separate src/dst strides
+  (void) idma_copy_2d_desc(ch, _pdst, _psrc, num_bytes,  // argument order flips to (dst, src); the call's result is discarded
+                          DESC_IDMA_PRIOR_L /*Default*/, num_lines, src_stride,
+                          dst_stride);
+}
+
+void dma_1dm(int ch,void *_psrc,void *_pdst, int num_bytes) {  // linear copy expressed as a 2D transfer with one row
+  (void) idma_copy_2d_desc(ch, _pdst, _psrc, num_bytes, DESC_IDMA_PRIOR_L /*Default*/,  // the call's result is discarded
+                          1, 0, 0);  // 1 row, source and destination strides both 0
+}
+
+
+
diff --git a/backends/cadence/vision/third-party/library/memory_manager.c b/backends/cadence/vision/third-party/library/memory_manager.c
new file mode 100644
index 00000000000..14e0eeddd6a
--- /dev/null
+++ b/backends/cadence/vision/third-party/library/memory_manager.c
@@ -0,0 +1,44 @@
+/*
+ * memory_manager.c
+ *
+ *  Created on: Dec 8, 2025
+ *      Author: Suraj Raut
+ *
+ *  Description: Definition of DRAM memory pools and local SRAM scratch buffer.
+ *               These must be defined in exactly one compilation unit.
+ */
+
+#include "lib.h"
+#include <xtensa/tie/xt_ivpn.h>  // For XCHAL_IVPN_SIMD_WIDTH
+
+// Memory pools placed in specific DRAM sections (.dram0.data / .dram1.data), each aligned to 128 bytes
+// These arrays are the backing storage that allocate_dram_buffer() hands out
+__attribute__((section(".dram0.data"))) __attribute__((aligned(64*2))) 
+uint8_t dram0_pool[IDMA_BUFFER_SIZE_DRAM0];
+
+__attribute__((section(".dram1.data"))) __attribute__((aligned(64*2))) 
+uint8_t dram1_pool[IDMA_BUFFER_SIZE_DRAM1];
+
+// Cache-mode padded input buffer (in system memory)
+// Used by cache-mode kernels for edge padding before convolution
+// This buffer is accessed through the processor's data cache (no section attribute is applied to it)
+__attribute__((aligned(64*2)))
+int8_t cache_padded_input[CACHE_PADDED_INPUT_SIZE];  // size comes from CACHE_PADDED_INPUT_SIZE (original note: 1 MB max)
+
+/**
+ * Carve `size` bytes, rounded up to a multiple of 2*XCHAL_IVPN_SIMD_WIDTH, out of dram0_pool (bank 0) or dram1_pool (any other bank).
+ */
+int8_t* allocate_dram_buffer(int size, int dram_bank, int* dram0_used, int* dram1_used) {
+    int8_t* ptr;
+    int aligned_size = (size + (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) & ~(2 * XCHAL_IVPN_SIMD_WIDTH - 1);
+    // NOTE(review): nothing checks the running offset against the pool size, and a negative size is not rejected
+    if (dram_bank == 0) {
+        ptr = (int8_t*)(dram0_pool + *dram0_used);
+        *dram0_used += aligned_size;
+    } else {
+        ptr = (int8_t*)(dram1_pool + *dram1_used);
+        *dram1_used += aligned_size;
+    }
+    // The caller-owned offset for the chosen bank has been advanced; this function never releases memory
+    return ptr;
+}
diff --git a/backends/cadence/vision/third-party/library/tables/expf_tbl.c b/backends/cadence/vision/third-party/library/tables/expf_tbl.c
index f1c6f3d44ae..0ed5dd22257 100644
--- a/backends/cadence/vision/third-party/library/tables/expf_tbl.c
+++ b/backends/cadence/vision/third-party/library/tables/expf_tbl.c
@@ -42,28 +42,22 @@
    p(order)=p(order)-(sum(p)-2);
 */
 const int32_t ALIGN_2SIMD expftbl_Q30[8] = {
-    234841,
-    1329551,
-    10400465,
-    59570027,
-    257946177,
-    744260763,
-    1073741824,
-    0 /* Padding to allow for vector loads */
+    234841,    1329551,   10400465,   59570027,
+    257946177, 744260763, 1073741824, 0 /* Padding to allow for vector loads */
 };
 
 const union ufloat32uint32 ALIGN_2SIMD
     expfminmax[2] = /* minimum and maximum arguments of expf() input */
     {
         {0xc2ce8ed0}, /*-1.0327893066e+002f */
-        {0x42b17218} /* 8.8722839355e+001f */
+        {0x42b17218}  /* 8.8722839355e+001f */
 };
 
 const int32_t invln2_Q30 = 1549082005L; /* 1/ln(2), Q30 */
 
 const union ufloat32uint32 ALIGN_2SIMD log2_e[2] = {
     {0x3fb8aa3b}, /* 1.4426950216      */
-    {0x32a57060} /* 1.9259629891e-008 */
+    {0x32a57060}  /* 1.9259629891e-008 */
 };
 
 /*
@@ -76,10 +70,5 @@ p(order)=p(order)-(sum(p)-2);
 num2hex(single(p));
 */
 const union ufloat32uint32 ALIGN_2SIMD expftblf[] = {
-    {0x39655635},
-    {0x3aa24c7a},
-    {0x3c1eb2d1},
-    {0x3d633ddb},
-    {0x3e75ff24},
-    {0x3f317212},
-    {0x3f800000}};
+    {0x39655635}, {0x3aa24c7a}, {0x3c1eb2d1}, {0x3d633ddb},
+    {0x3e75ff24}, {0x3f317212}, {0x3f800000}};
diff --git a/backends/cadence/vision/third-party/library/tables/inff_tbl.c b/backends/cadence/vision/third-party/library/tables/inff_tbl.c
index 8464ee9f549..9b2bf62e6bf 100644
--- a/backends/cadence/vision/third-party/library/tables/inff_tbl.c
+++ b/backends/cadence/vision/third-party/library/tables/inff_tbl.c
@@ -31,7 +31,7 @@
 #include "dtypes.h"
 
 const union ufloat32uint32 minusInff = {0xff800000}; /* -Inf */
-const union ufloat32uint32 plusInff = {0x7f800000}; /* +Inf */
+const union ufloat32uint32 plusInff = {0x7f800000};  /* +Inf */
 const union ufloat32uint32 realmaxf = {
     0x7f7fffff}; /* maximum floating point number */
 const union ufloat32uint32 realminf = {
diff --git a/backends/cadence/vision/third-party/library/tables/nanf_tbl.c b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c
index f165234fce4..27c5f437b9a 100644
--- a/backends/cadence/vision/third-party/library/tables/nanf_tbl.c
+++ b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c
@@ -27,9 +27,9 @@
 */
 
 /* Portable data types. */
+#include "dtypes.h"
 /* NaN values for single precision routines. */
 #include "nanf_tbl.h"
-#include "dtypes.h"
 
 const union ufloat32uint32 sNaNf = {0x7f800001}; /* Signalling NaN          */
 const union ufloat32uint32 qNaNf = {0x7fc00000}; /* Quiet NaN               */
diff --git a/backends/cadence/vision/third-party/library/utils.c b/backends/cadence/vision/third-party/library/utils.c
new file mode 100644
index 00000000000..05366f88b2e
--- /dev/null
+++ b/backends/cadence/vision/third-party/library/utils.c
@@ -0,0 +1,26 @@
+/*
+ * utils.c
+ *
+ *  Created on: Nov 4, 2025
+ *      Author: sraut
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+
+
+//static inline int inc_iter_to_temp(int *temp, int var, int bound, int carry) {
+//  int new_val = var + carry;
+//  carry = new_val == bound;
+//  *temp = carry ? 0 : new_val;
+//  return carry;
+//}
+//
+//static inline void swap_buffers(uint8_t **a, uint8_t **b) {
+//  uint8_t *t = *a;
+//  *a = *b;
+//  *b = t;
+//}
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv.c
new file mode 100644
index 00000000000..fba6f8bcef4
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv.c
@@ -0,0 +1,1668 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include <stdio.h>
+#if ((XCHAL_VISION_TYPE >= 6))
+
+/******************************************************************************
+ * Generic 3D convolution entry point
+ * Looks up the matching variant via xaiGetConvolve3DVariant and invokes it
+ *****************************************************************************/
+XAI_ERR_TYPE xaiConvolve3D(const xai_pTile3D inTile,
+                           const xai_pTile4D coeffTile,
+                           const xai_pArray biasArray,
+                           xai_pTile3D outTile,
+                           xai_cnn_conv_params *param)
+{
+  /* Call signature through which the selected variant is invoked */
+  typedef XAI_ERR_TYPE (*convVariantFn)(const xai_pTile3D inTile,
+                                        const xai_pTile4D coeffTile,
+                                        const xai_pArray biasArray,
+                                        xai_pTile3D outTile,
+                                        xai_cnn_conv_params* param);
+  convVariantFn variant;
+
+  /* xaiGetConvolve3DVariant derives the variant from inTile, coeffTile and
+   * param, so a missing one is reported before the lookup is attempted */
+  if (!(inTile && coeffTile && param))
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  /* A NULL lookup result means no variant matches this configuration */
+  variant = (convVariantFn) xaiGetConvolve3DVariant(inTile, coeffTile, biasArray, outTile, param);
+  if (variant != NULL)
+  {
+    return(variant(inTile, coeffTile, biasArray, outTile, param));
+  }
+  else
+  {
+    return(XAI_ERR_NO_VARIANT);
+  }
+}
+
+/**************************************************************************************
+* 3D convolution helper: returns the matching variant's address cast to XAI_ERR_TYPE *, or
+* NULL when nothing matches or inTile/coeffTile/param is NULL; biasArray/outTile are unused
+**************************************************************************************/
+XAI_ERR_TYPE *xaiGetConvolve3DVariant(const xai_pTile3D inTile,
+                                      const xai_pTile4D coeffTile,
+                                      const xai_pArray biasArray,
+                                      xai_pTile3D outTile,
+                                      xai_cnn_conv_params *param)
+{
+  if ((!inTile) || (!coeffTile) || (!param))
+  {
+    return(NULL);
+  }
+
+  xai_cnn_data_order coeffOrder = XAI_TILE4D_GET_DATA_ORDER(coeffTile);
+
+  xai_cnn_data_order inOrder = XAI_TILE3D_GET_DATA_ORDER(inTile);
+  int32_t kWidth, kHeight;
+  uint8_t stride; /* assigned and read only in the XAI_WHDN (MOW) branch */
+
+  if (coeffOrder == XAI_NDWH)
+  {
+    /* MOD variants: kernel width/height are coefficient dims 3 and 4; stride is not consulted */
+    kWidth  = XAI_TILE4D_GET_DIM3(coeffTile);
+    kHeight = XAI_TILE4D_GET_DIM4(coeffTile);
+
+    if (inOrder == XAI_WHD)
+    {
+      if (kWidth == 1 && kHeight == 1)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 3 && kHeight == 3)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 5 && kHeight == 5)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 7 && kHeight == 7)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH);
+      }
+    }
+    else if (inOrder == XAI_DWH)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (kWidth == 1 && kHeight == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 3 && kHeight == 3)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 5 && kHeight == 5)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 7 && kHeight == 7)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_DWH);
+        }
+        else
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_DWH);
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (kWidth == 1 && kHeight == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1_U8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 3 && kHeight == 3)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3_U8S8IXCa2_MOD_DWH);
+        }
+      }
+    }
+  }
+  else if (coeffOrder == XAI_WHDN)
+  {
+    /* MOW variants: kernel width/height are coefficient dims 1 and 2; keyed on type and stride */
+    stride  = XAI_CNN_CONV_GET_STRIDE(param);
+    kWidth  = XAI_TILE4D_GET_DIM1(coeffTile);
+    kHeight = XAI_TILE4D_GET_DIM2(coeffTile);
+
+    if (kWidth == 1 && kHeight == 1)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1j1_S8S8IX_MOW_WHD);
+        }
+        else if (stride == 2)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1j2_S8S8IX_MOW_WHD);
+        }
+        else if (stride == 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1j4_S8S8IX_MOW_WHD);
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1j1_U8S8IX_MOW_WHD);
+        }
+        if (stride == 2)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1j2_U8S8IX_MOW_WHD);
+        }
+        if (stride == 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1j4_U8S8IX_MOW_WHD);
+        }
+      }
+    }
+    else if (kWidth == 3 && kHeight == 3)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3j1_S8S8IX_MOW_WHD);
+        }
+        else if (stride == 2)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3j2_S8S8IX_MOW_WHD);
+        }
+        else if (stride == 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3j4_S8S8IX_MOW_WHD);
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3j1_U8S8IX_MOW_WHD);
+        }
+        if (stride == 2)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3j2_U8S8IX_MOW_WHD);
+        }
+        if (stride == 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3j4_U8S8IX_MOW_WHD);
+        }
+      }
+    }
+    else if (kWidth == 5 && kHeight == 5)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5j1_S8S8IX_MOW_WHD);
+        }
+        else if (stride == 2)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5j2_S8S8IX_MOW_WHD);
+        }
+        else if (stride == 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5j4_S8S8IX_MOW_WHD);
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5j1_U8S8IX_MOW_WHD);
+        }
+        if (stride == 2)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5j2_U8S8IX_MOW_WHD);
+        }
+        if (stride == 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5j4_U8S8IX_MOW_WHD);
+        }
+      }
+    }
+    else if (kWidth == 7 && kHeight == 7)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7j1_S8S8IX_MOW_WHD);
+        }
+        else if (stride == 2)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7j2_S8S8IX_MOW_WHD);
+        }
+        else if (stride == 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7j4_S8S8IX_MOW_WHD);
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7j1_U8S8IX_MOW_WHD);
+        }
+        if (stride == 2)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7j2_U8S8IX_MOW_WHD);
+        }
+        if (stride == 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7j4_U8S8IX_MOW_WHD);
+        }
+      }
+    }
+    else
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxNj1_S8S8IX_MOW_WHD);
+        }
+        else if (stride == 2)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxNj2_S8S8IX_MOW_WHD);
+        }
+        else if (stride == 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxNj4_S8S8IX_MOW_WHD);
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxNj1_U8S8IX_MOW_WHD);
+        }
+        if (stride == 2)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxNj2_U8S8IX_MOW_WHD);
+        }
+        if (stride == 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxNj4_U8S8IX_MOW_WHD);
+        }
+      }
+    }
+  }
+  else if (coeffOrder == XAI_DWHN)
+  {
+    /* SO variants: selected on the input tile type alone */
+    if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+    {
+      return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxN_S8S8IX_SO_DWH);
+    }
+    else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+    {
+      return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxN_U8S8IX_SO_DWH);
+    }
+  }
+
+  return(NULL); /* every combination not matched above falls through to here */
+}
+
+/******************************************************************************
+ * Generic dilated 3D convolution entry point
+ * Looks up the matching variant via xaiGetConvolved3DVariant and invokes it
+ *****************************************************************************/
+XAI_ERR_TYPE xaiConvolved3D(const xai_pTile3D inTile,
+                            const xai_pTile4D coeffTile,
+                            const xai_pArray biasArray,
+                            xai_pTile3D outTile,
+                            const xai_cnn_conv_params *param)
+{
+  /* Call signature through which the selected dilated variant is invoked */
+  typedef XAI_ERR_TYPE (*dilatedVariantFn)(const xai_pTile3D inTile,
+                                           const xai_pTile4D coeffTile,
+                                           const xai_pArray biasArray,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_conv_params* param);
+  dilatedVariantFn variant;
+
+  /* xaiGetConvolved3DVariant derives the variant from inTile, coeffTile and
+   * param, so a missing one is reported before the lookup is attempted */
+  if (!(inTile && coeffTile && param))
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  /* A NULL lookup result means no variant matches this configuration */
+  variant =
+    (dilatedVariantFn) xaiGetConvolved3DVariant(inTile, coeffTile, biasArray, outTile, param);
+  if (variant != NULL)
+  {
+    return(variant(inTile, coeffTile, biasArray, outTile, param));
+  }
+  else
+  {
+    return(XAI_ERR_NO_VARIANT);
+  }
+}
+
+/*********************************************************************************************
+* 3D dilated convolution helper: returns the matching variant's address cast to XAI_ERR_TYPE *,
+* or NULL when nothing matches or inTile/coeffTile/param is NULL; biasArray/outTile are unused
+*********************************************************************************************/
+XAI_ERR_TYPE *xaiGetConvolved3DVariant(const xai_pTile3D inTile,
+                                       const xai_pTile4D coeffTile,
+                                       const xai_pArray biasArray,
+                                       xai_pTile3D outTile,
+                                       const xai_cnn_conv_params *param)
+{
+  if ((!inTile) || (!coeffTile) || (!param))
+  {
+    return(NULL);
+  }
+
+  uint8_t stride;   /* assigned and read only in the XAI_WHDN (MOW) branch */
+  uint8_t dilation; /* assigned and read only in the XAI_WHDN (MOW) branch */
+  xai_cnn_data_order coeffOrder = XAI_TILE4D_GET_DATA_ORDER(coeffTile);
+
+
+  int32_t kWidth, kHeight;
+  xai_cnn_data_order inOrder = XAI_TILE3D_GET_DATA_ORDER(inTile);
+
+  if (coeffOrder == XAI_NDWH)
+  {
+    /* MOD variants: kernel width/height are coefficient dims 3 and 4 */
+    kWidth  = XAI_TILE4D_GET_DIM3(coeffTile);
+    kHeight = XAI_TILE4D_GET_DIM4(coeffTile);
+
+    if (inOrder == XAI_WHD)
+    {
+      if (kWidth == 1 && kHeight == 1)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 2 && kHeight == 2)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolved3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 3 && kHeight == 3)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 4 && kHeight == 4)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolved3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 5 && kHeight == 5)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 7 && kHeight == 7)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH);
+      }
+    }
+    else if (inOrder == XAI_DWH)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) && XAI_TILE4D_CHECK_TYPE(coeffTile, XAI_S8))
+      {
+        if (XAI_CNN_CONV_GET_STRIDEX(param) != XAI_CNN_CONV_GET_STRIDEY(param))
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH);
+        }
+        else if (XAI_CNN_CONV_GET_STRIDE(param) != 1 && XAI_CNN_CONV_GET_STRIDE(param) != 2 \
+                 && XAI_CNN_CONV_GET_STRIDE(param) != 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 1 && kHeight == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 2 && kHeight == 2)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolved3D_S_2x2_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 3 && kHeight == 3)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 4 && kHeight == 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolved3D_S_4x4_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 5 && kHeight == 5)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 7 && kHeight == 7)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_DWH);
+        }
+        else
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH);
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8) && XAI_TILE4D_CHECK_TYPE(coeffTile, XAI_S8))
+      {
+        if (kWidth == 1 && kHeight == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1_U8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 3 && kHeight == 3)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3_U8S8IXCa2_MOD_DWH);
+        }
+        else
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH);
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16) && XAI_TILE4D_CHECK_TYPE(coeffTile, XAI_S16))
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_S16S16I16_MOD_DWH);
+      }
+    }
+  }
+  else if (coeffOrder == XAI_WHDN)
+  {
+    /* MOW variants: kernel dims 1 and 2; keyed on input type, stride and dilation */
+    stride   = XAI_CNN_CONV_GET_STRIDE(param);
+    dilation = XAI_CNN_CONV_GET_DILATION(param);
+    kWidth   = XAI_TILE4D_GET_DIM1(coeffTile);
+    kHeight  = XAI_TILE4D_GET_DIM2(coeffTile);
+
+    if (kWidth == 1 && kHeight == 1)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j1d1_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j2d1_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j4d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j1d1_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j2d1_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j4d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+#if 0  /* F16 disabled - no implementation available */
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_F16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j1d1_F16_MOW_WHD);
+          }
+        }
+      }
+#endif
+    }
+    else if (kWidth == 2 && kHeight == 2)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_2x2j1d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_2x2j1d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+#if 0  /* F16 disabled - no implementation available */
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_F16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_2x2j1d1_F16_MOW_WHD);
+          }
+        }
+      }
+#endif
+    }
+    else if (kWidth == 3 && kHeight == 3)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d1_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d2_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j2d1_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j4d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d1_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d2_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d4_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j2d1_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j4d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+#if 0  /* F16 disabled - no implementation available */
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_F16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d1_F16_MOW_WHD);
+          }
+        }
+        if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j2d1_F16_MOW_WHD);
+          }
+        }
+      }
+#endif
+    }
+    else if (kWidth == 4 && kHeight == 4)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_4x4j1d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_4x4j1d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+    }
+    else if (kWidth == 5 && kHeight == 5)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j1d1_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j1d2_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j1d4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j2d1_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j4d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j1d1_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j1d2_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j1d4_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j2d1_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j4d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+    }
+    else if (kWidth == 7 && kHeight == 7)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j1d1_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j1d2_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j1d4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j2d1_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j4d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j1d1_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j1d2_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j1d4_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j2d1_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j4d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+    }
+    else
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d2_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d2_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d4_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+#if 0  /* F16 disabled - no implementation available */
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_F16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_F16_MOW_WHD);
+          }
+        }
+      }
+#endif
+    }
+  }
+  else if (coeffOrder == XAI_DWHN)
+  {
+    /* SO variants: selected on the input tile type alone */
+    if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+    {
+      return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_S8S8IX_SO_DWH);
+    }
+    else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+    {
+      return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_U8S8IX_SO_DWH);
+    }
+  }
+
+  return(NULL); /* every combination not matched above falls through to here */
+}
+
+/******************************************************************************
+ * Depthwise convolution, generic entry point.
+ * Looks up the kernel variant that fits the arguments and forwards the call.
+ *****************************************************************************/
+XAI_ERR_TYPE xaiDepthwiseConvolve2D(const xai_pTile3D inTile,
+                                    const xai_pTile3D coeffTile,
+                                    const xai_pArray biasArray,
+                                    xai_pTile3D outTile,
+                                    const xai_cnn_conv_params *param)
+{
+  /* Common signature of all depthwise convolution kernel variants */
+  typedef XAI_ERR_TYPE (*depthwiseKernelFn)(const xai_pTile3D inTile,
+                                            const xai_pTile3D coeffTile,
+                                            const xai_pArray biasArray,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_conv_params* param);
+  depthwiseKernelFn kernel;
+
+  /* inTile, coeffTile and param are inspected by the variant lookup, so a
+   * NULL in any of them is reported before a variant is requested */
+  if ((inTile == NULL) || (coeffTile == NULL) || (param == NULL))
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  /* Ask xaiGetDepthwiseConvolve2DVariant which kernel fits these arguments */
+  kernel = (depthwiseKernelFn) xaiGetDepthwiseConvolve2DVariant(inTile,
+                                                                coeffTile,
+                                                                biasArray,
+                                                                outTile,
+                                                                param);
+
+  /* A NULL result means the lookup produced no usable variant */
+  if (kernel != NULL)
+  {
+    return(kernel(inTile, coeffTile, biasArray, outTile, param));
+  }
+
+  return(XAI_ERR_NO_VARIANT);
+}
+
+/**************************************************************************************
+* 2D depthwise convolution helper function
+* Returns the address of a specific depthwise convolution variant, selected from the
+* tile data orders, data types, kernel size and stride; NULL if none matches
+**************************************************************************************/
+XAI_ERR_TYPE *xaiGetDepthwiseConvolve2DVariant(const xai_pTile3D inTile,
+                                               const xai_pTile3D coeffTile,
+                                               const xai_pArray biasArray,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_conv_params *param)
+{
+  if ((!inTile) || (!coeffTile) || (!outTile) || (!param)) /* biasArray is not examined in this helper */
+  {
+    return(NULL);
+  }
+  if (!(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile))) /* input and output must share one data order */
+  {
+    return(NULL);
+  }
+
+  xai_cnn_data_order inOrder    = XAI_TILE3D_GET_DATA_ORDER(inTile);
+  xai_cnn_data_order coeffOrder = XAI_TILE3D_GET_DATA_ORDER(coeffTile);
+  int32_t kWidth, kHeight; /* kernel extents; which coeffTile dims hold them depends on coeffOrder */
+
+  if (coeffOrder == XAI_DWH)
+  {
+    /* MOD variants */
+    kWidth  = XAI_TILE3D_GET_DIM2(coeffTile);
+    kHeight = XAI_TILE3D_GET_DIM3(coeffTile);
+    if (inOrder == XAI_DWH)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (XAI_CNN_CONV_GET_STRIDEX(param) != XAI_CNN_CONV_GET_STRIDEY(param)) /* unequal X/Y strides: generic MxN kernel */
+        {
+          if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_S8S8IXCa2_MOD_DWH);
+          }
+          else
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_U8S8IXCa2_MOD_DWH);
+          }
+        }
+        else if (XAI_CNN_CONV_GET_STRIDE(param) != 1 && XAI_CNN_CONV_GET_STRIDE(param) != 2 \
+                 && XAI_CNN_CONV_GET_STRIDE(param) != 4)
+        { /* stride other than 1, 2 or 4: generic MxN kernel */
+          if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_S8S8IXCa2_MOD_DWH);
+          }
+          else
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_U8S8IXCa2_MOD_DWH);
+          }
+        }
+        else if (kWidth == 3 && kHeight == 3)
+        {
+          if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3_S8S8IXCa2_MOD_DWH);
+          }
+          else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3_U8S8IXCa2_MOD_DWH);
+          }
+        }
+        else if (kWidth == 5 && kHeight == 5)
+        {
+          if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5_S8S8IXCa2_MOD_DWH);
+          }
+          else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5_U8S8IXCa2_MOD_DWH);
+          }
+        }
+        else if (kWidth == 7 && kHeight == 7 && XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) /* S8 only; a 7x7 U8 input takes the MxN branch below */
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7_S8S8IXCa2_MOD_DWH);
+        }
+        else /* every remaining kernel size: generic MxN kernel */
+        {
+          if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_S8S8IXCa2_MOD_DWH);
+          }
+          else
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_U8S8IXCa2_MOD_DWH);
+          }
+        }
+      } /* if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) */
+      else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) /* 16-bit path is keyed on the OUTPUT tile type */
+      {
+        return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_S16S16I16_MOD_DWH);
+      } /* if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))*/
+    }   /* if(inOrder == XAI_DWH) */
+  }     /* if(coeffOrder == XAI_DWH) */
+  else if (coeffOrder == XAI_WHD) /* inOrder is not checked on this path */
+  {
+    /* MOW variants */
+    uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); /* only strides 1, 2 and 4 map to a variant below */
+    kWidth  = XAI_TILE3D_GET_DIM1(coeffTile);
+    kHeight = XAI_TILE3D_GET_DIM2(coeffTile);
+    if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+    {
+      if (kWidth == 3 && kHeight == 3)
+      {
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3j1_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3j2_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3j4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3j1_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3j2_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3j4_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (kWidth == 5 && kHeight == 5)
+      {
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5j1_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5j2_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5j4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5j1_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5j2_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5j4_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (kWidth == 7 && kHeight == 7)
+      {
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7j1_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7j2_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7j4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7j1_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7j2_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7j4_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else
+      {
+        /* MOW Variants: generic MxN kernels for every other kernel size */
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj1_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj2_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj1_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj2_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj4_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+/* #if XCHAL_VISION_QUAD_MAC_TYPE != 0 */
+    } /* if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) */
+    else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) /* 16-bit path is keyed on the OUTPUT tile type */
+    {
+      if (stride == 1)
+      {
+        return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj1_S16S16I16_MOW_WHD);
+      }
+      else if (stride == 2)
+      {
+        return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj2_S16S16I16_MOW_WHD);
+      }
+      else if (stride == 4)
+      {
+        return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj4_S16S16I16_MOW_WHD);
+      }
+    } /* if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) */
+  }   /*  if(coeffOrder == XAI_WHD) */
+  return(NULL); /* no branch above selected a variant */
+}
+
+/******************************************************************************
+ * Depthwise dilated convolution, generic entry point.
+ * Looks up the kernel variant that fits the arguments and forwards the call.
+ *****************************************************************************/
+XAI_ERR_TYPE xaiDepthwiseConvolved2D(const xai_pTile3D inTile,
+                                     const xai_pTile3D coeffTile,
+                                     const xai_pArray biasArray,
+                                     xai_pTile3D outTile,
+                                     const xai_cnn_depthwiseDilatedConv_params *param)
+{
+  /* Common signature of all depthwise dilated convolution kernel variants */
+  typedef XAI_ERR_TYPE (*depthwiseDilatedKernelFn)(const xai_pTile3D inTile,
+                                                   const xai_pTile3D coeffTile,
+                                                   const xai_pArray biasArray,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_depthwiseDilatedConv_params* param);
+  depthwiseDilatedKernelFn kernel;
+
+  /* inTile, coeffTile and param are inspected by the variant lookup, so a
+   * NULL in any of them is reported before a variant is requested */
+  if ((inTile == NULL) || (coeffTile == NULL) || (param == NULL))
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  /* Ask xaiGetDepthwiseConvolved2DVariant which kernel fits these arguments */
+  kernel = (depthwiseDilatedKernelFn) xaiGetDepthwiseConvolved2DVariant(inTile, coeffTile, biasArray, outTile, param);
+
+  /* A NULL result means the lookup produced no usable variant */
+  if (kernel != NULL)
+  {
+    return(kernel(inTile, coeffTile, biasArray, outTile, param));
+  }
+
+  return(XAI_ERR_NO_VARIANT);
+}
+
+/**************************************************************************************
+* 2D depthwise dilated convolution helper function
+* Returns the address of a specific depthwise dilated convolution variant, selected
+* from data order, input type, kernel size, stride and dilation; NULL if none matches
+**************************************************************************************/
+XAI_ERR_TYPE *xaiGetDepthwiseConvolved2DVariant(const xai_pTile3D inTile,
+                                                const xai_pTile3D coeffTile,
+                                                const xai_pArray biasArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_depthwiseDilatedConv_params *param)
+{
+  if ((!inTile) || (!coeffTile) || (!param)) /* biasArray and outTile are not examined in this helper */
+  {
+    return(NULL);
+  }
+
+  xai_cnn_data_order inOrder    = XAI_TILE3D_GET_DATA_ORDER(inTile);
+  xai_cnn_data_order coeffOrder = XAI_TILE3D_GET_DATA_ORDER(coeffTile);
+#if (XCHAL_HAVE_SUPERGATHER == 0) /* depthMultiplier only influences selection in this configuration */
+  int32_t depthMultiplier = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(param);
+#endif
+  uint8_t stride;
+  uint8_t dilation;
+
+  int32_t kWidth, kHeight;
+  if (coeffOrder == XAI_DWH)
+  {
+    kWidth  = XAI_TILE3D_GET_DIM2(coeffTile);
+    kHeight = XAI_TILE3D_GET_DIM3(coeffTile);
+    /* MOD variants */
+    if (inOrder == XAI_DWH)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) /* U8 input: generic MxN kernel for every kernel size */
+      {
+        return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxN_U8S8IX_MOD_DWH);
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+#if (XCHAL_HAVE_SUPERGATHER == 0) /* here depthMultiplier == 8 always ends in the generic MxN kernel */
+        if (kWidth == 3 && kHeight == 3 && depthMultiplier != 8)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_3x3_S8S8IX_MOD_DWH);
+        }
+        else if (kWidth == 5 && kHeight == 5 && depthMultiplier != 8)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_5x5_S8S8IX_MOD_DWH);
+        }
+        else if (kWidth == 7 && kHeight == 7 && depthMultiplier != 8)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_7x7_S8S8IX_MOD_DWH);
+        }
+#else
+        if (kWidth == 3 && kHeight == 3)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_3x3_S8S8IX_MOD_DWH);
+        }
+        else if (kWidth == 5 && kHeight == 5)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_5x5_S8S8IX_MOD_DWH);
+        }
+        else if (kWidth == 7 && kHeight == 7)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_7x7_S8S8IX_MOD_DWH);
+        }
+#endif
+        else /* shared tail of both preprocessor branches above */
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxN_S8S8IX_MOD_DWH);
+        }
+      }
+      else /* assumed S16: every type other than U8/S8 lands here - NOTE(review): no explicit type check, confirm intended */
+      {
+        if (kWidth == 3 && kHeight == 3)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_3x3_S16S16I16_MOD_DWH);
+        }
+        else if (kWidth == 5 && kHeight == 5)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_5x5_S16S16I16_MOD_DWH);
+        }
+        else
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxN_S16S16I16_MOD_DWH);
+        }
+      }
+    }
+  }
+  /*else*/ if (coeffOrder == XAI_WHD)
+  {
+    /* MOW variants */
+
+    stride   = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param);
+    dilation = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param);
+//#endif
+    /*if(kWidth == 3 && kHeight == 3)
+       {
+       if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+       {
+        if(stride == 1)
+        {
+          if(dilation == 2)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_3x3j1d2_S8S8IX_MOW_WHD);
+          }
+          else if(dilation == 4)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_3x3j1d4_S8S8IX_MOW_WHD);
+           }
+        }
+       }
+       else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+       {
+        if(stride == 1)
+        {
+          if(dilation == 2)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_3x3j1d2_U8S8IX_MOW_WHD);
+          }
+          else if(dilation == 4)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_3x3j1d4_U8S8IX_MOW_WHD);
+           }
+        }
+       }
+       }*/
+    /*else if(kWidth == 5 && kHeight == 5)
+       {
+       if (xaiTile3DCheckType(inTile, XAI_S8))
+       {
+        if(stride == 1)
+        {
+          if(dilation == 2)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_5x5j1d2_S8S8IX_MOW_WHD);
+          }
+          else if(dilation == 4)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_5x5j1d4_S8S8IX_MOW_WHD);
+          }
+        }
+       }
+       else if (xaiTile3DCheckType(inTile, XAI_U8))
+       {
+        if(stride == 1)
+        {
+          if(dilation == 2)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_5x5j1d2_U8S8IX_MOW_WHD);
+          }
+          else if(dilation == 4)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_5x5j1d4_U8S8IX_MOW_WHD);
+          }
+        }
+       }
+       }
+       else if(kWidth == 7 && kHeight == 7)
+       {
+       if (xaiTile3DCheckType(inTile, XAI_S8))
+       {
+        if(stride == 1)
+        {
+          if(dilation == 2)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_7x7j1d2_S8S8IX_MOW_WHD);
+          }
+          else if(dilation == 4)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_7x7j1d4_S8S8IX_MOW_WHD);
+           }
+        }
+       }
+       else if (xaiTile3DCheckType(inTile, XAI_U8))
+       {
+        if(stride == 1)
+        {
+          if(dilation == 2)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_7x7j1d2_U8S8IX_MOW_WHD);
+          }
+          else if(dilation == 4)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_7x7j1d4_U8S8IX_MOW_WHD);
+          }
+        }
+       }
+       }*/
+    /* else */
+    { /* only live MOW selection: generic MxN kernels, stride 1 with dilation 2 or 4 */
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxNj1d2_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxNj1d4_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxNj1d2_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxNj1d4_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+//#endif
+    }
+  }
+  return(NULL); /* no branch above selected a variant */
+}
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOD.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOD.c
new file mode 100644
index 00000000000..6a704532585
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOD.c
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+/******************************************************************************************
+* MOD WHD variants
+******************************************************************************************/
+
+
+/*****************************************************************************
+*  xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized 3D convolution, non-dilated (dilation = 1)    */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 1x1xDxN                                     */
+/*               Input is in WHD and Output is in DWH format                */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+XAI_ERR_TYPE xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params *param
+  )
+{
+  /* Error Checks: param is the only argument checked in this wrapper */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* set dilation to 1 in X and Y on param before delegating */
+
+  return(xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param)); /* the dilated kernel does the work */
+
+  return(XAI_ERROR_STATUS()); /* never reached: the return above always exits first */
+}
+
+/*****************************************************************************
+*  xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized 3D convolution, non-dilated (dilation = 1)    */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 3x3xDxN                                     */
+/*               Input is in WHD and Output is in DWH format                */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+XAI_ERR_TYPE xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params *param
+  )
+{
+  /* Error Checks: param is the only argument checked in this wrapper */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* set dilation to 1 in X and Y on param before delegating */
+
+  return(xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param)); /* the dilated kernel does the work */
+
+  return(XAI_ERROR_STATUS()); /* never reached: the return above always exits first */
+}
+
+/*****************************************************************************
+*  xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized 3D convolution, non-dilated (dilation = 1)    */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 5x5xDxN                                     */
+/*               Input is in WHD and Output is in DWH format                */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+XAI_ERR_TYPE xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params *param
+  )
+{
+  /* Error Checks: param is the only argument checked in this wrapper */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* set dilation to 1 in X and Y on param before delegating */
+
+  return(xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param)); /* the dilated kernel does the work */
+
+  return(XAI_ERROR_STATUS()); /* never reached: the return above always exits first */
+}
+
+/*****************************************************************************
+*  xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution.             */
+/*               Stride values = 1, 2 and 4 are supported                   */
+/*               This entry point always forces dilation = 1 (X and Y) and  */
+/*               delegates to xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH    */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 7x7xDxN                                     */
+/*               Input is in WHD and Output is in DWH format                */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+XAI_ERR_TYPE xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params *param
+  )
+{
+  /* Error Checks: param is the only argument checked in this wrapper */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* set dilation to 1 in X and Y on param before delegating */
+
+  return(xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param)); /* the dilated kernel does the work */
+
+  return(XAI_ERROR_STATUS()); /* never reached: the return above always exits first */
+}
+
+/*****************************************************************************
+*  xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution              */
+/*               Stride values = 1, 2 and 4 are supported                   */
+/*               This entry point always forces dilation = 1 (X and Y) and  */
+/*               delegates to xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH    */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk                                    */
+/*               Input is in WHD and Output is in DWH format                */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+XAI_ERR_TYPE xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params *param
+  )
+{
+  /* Error Checks: param is the only argument checked in this wrapper */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* set dilation to 1 in X and Y on param before delegating */
+
+  return(xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param)); /* the dilated kernel does the work */
+
+  return(XAI_ERROR_STATUS()); /* never reached: the return above always exits first */
+}
+
+/******************************************************************************************
+* MOD DWH variants
+******************************************************************************************/
+
+
+
+/*****************************************************************************
+*  xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution              */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 1x1xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+XAI_ERR_TYPE xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params *param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, overriding any caller-set value - confirm */
+  /* Delegate to xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_DWH with all five arguments; its status is this function's result. */
+  return(xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolve3D_S_1x1_U8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution              */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 1x1xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+XAI_ERR_TYPE xaiConvolve3D_S_1x1_U8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params *param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, overriding any caller-set value - confirm */
+  /* Delegate to xaiConvolved3D_S_1x1_U8S8IXCa2_MOD_DWH with all five arguments; its status is this function's result. */
+  return(xaiConvolved3D_S_1x1_U8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution              */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 3x3xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+XAI_ERR_TYPE xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params *param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, overriding any caller-set value - confirm */
+  /* Delegate to xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_DWH with all five arguments; its status is this function's result. */
+  return(xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolve3D_S_3x3_U8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution              */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 3x3xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+XAI_ERR_TYPE xaiConvolve3D_S_3x3_U8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params *param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, overriding any caller-set value - confirm */
+  /* Delegate to xaiConvolved3D_S_3x3_U8S8IXCa2_MOD_DWH with all five arguments; its status is this function's result. */
+  return(xaiConvolved3D_S_3x3_U8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution              */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 5x5xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+XAI_ERR_TYPE xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params *param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, overriding any caller-set value - confirm */
+  /* Delegate to xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_DWH with all five arguments; its status is this function's result. */
+  return(xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution              */
+/*               Stride values = 1, 2 and 4 are supported.                  */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and dilation = 1 for stride = 2, 4                         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 7x7xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+XAI_ERR_TYPE xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params *param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, so the header's "dilation >= 1" note would apply only to the callee - confirm */
+  /* Delegate to xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_DWH with all five arguments; its status is this function's result. */
+  return(xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution              */
+/*               Stride values = 1, 2 and 4 are supported                   */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and dilation = 1 for stride = 2, 4                         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 15.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+XAI_ERR_TYPE xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params *param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, so the header's "dilation >= 1" note would apply only to the callee - confirm */
+  /* Delegate to xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH with all five arguments; its status is this function's result. */
+  return(xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************* end of MOD variants ***************************************/
+/*******************************************************************************************/
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.c
new file mode 100644
index 00000000000..1e9a385aa63
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.c
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+/* cnn_conv_MOW.h acts as a template: it is included once per INPUT_DATA_TYPE so its MAKE_NAME() functions are emitted in a U8 and an S8 flavour. */
+#define INPUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_conv_MOW.h"
+#undef INPUT_DATA_TYPE
+/* Second pass: the header's SIGNED8BIT branch #undefs the MORPH_* macros left by the first pass before redefining them. */
+#define INPUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_conv_MOW.h" /* INPUT_DATA_TYPE is not #undef'd after this last inclusion */
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.h
new file mode 100644
index 00000000000..eec1cedea17
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.h
@@ -0,0 +1,738 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT, suffix)  name ## _ ## MORPH_FNAME_SPECIFIER_IDT ## suffix
+/* MAKE_NAME(name, suffix) pastes name, "_", the input-type tag (U8 or S8) and suffix, e.g. xaiConvolve3D_S_1x1j1_U8S8IX_MOW_WHD. */
+#if INPUT_DATA_TYPE == UNSIGNED8BIT
+/* Unsigned 8-bit input: bind the MORPH_* names to their U8 spellings. */
+#define MAKE_NAME(name, suffix)  MAKE_NAME_IMPL(name, U8, suffix)
+#define MORPH_IDT_CHECK              XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR             uint8_t
+#define MORPH_IDT_2Nx8               xb_vec2Nx8U
+#define MORPH_OP_PRIME_2Nx8          IVP_LA2NX8U_PP
+#define MORPH_OP_ALIGN_LOAD_2Nx8     IVP_LV2NX8U_XP
+#define MORPH_OP_LOAD_2Nx8           IVP_LA2NX8U_XP
+#define MORPH_OP_LOAD_2Nx8_IP        IVP_LA2NX8U_IP
+#define MORPH_OP_LOAD_2Nx8_VARIABLE  IVP_LAV2NX8U_XP
+#define MORPH_OP_MULA                IVP_MULUSA2N8XR16
+#define MORPH_OP_MUL4TA              IVP_MULUS4TA2N8XR8
+#define MORPH_OP_MULQA               IVP_MULUSQA2N8XR8
+#define MORPH_OP_MULPA               IVP_MULUSPA2N8XR16
+
+#elif INPUT_DATA_TYPE == SIGNED8BIT
+/* Signed 8-bit input: first drop every name the UNSIGNED8BIT branch defines, so a prior inclusion does not clash with the definitions below. */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_2Nx8
+#undef MORPH_OP_PRIME_2Nx8
+#undef MORPH_OP_ALIGN_LOAD_2Nx8
+#undef MORPH_OP_LOAD_2Nx8_IP
+#undef MORPH_OP_LOAD_2Nx8_VARIABLE
+#undef MORPH_OP_LOAD_2Nx8
+#undef MORPH_OP_MULA
+#undef MORPH_OP_MUL4TA
+#undef MORPH_OP_MULQA
+#undef MORPH_OP_MULPA
+/* Rebind the same MORPH_* names to their S8 spellings. */
+#define MAKE_NAME(name, suffix)  MAKE_NAME_IMPL(name, S8, suffix)
+#define MORPH_IDT_CHECK              XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR             int8_t
+#define MORPH_IDT_2Nx8               xb_vec2Nx8
+#define MORPH_OP_PRIME_2Nx8          IVP_LA2NX8_PP
+#define MORPH_OP_ALIGN_LOAD_2Nx8     IVP_LV2NX8_XP
+#define MORPH_OP_LOAD_2Nx8           IVP_LA2NX8_XP
+#define MORPH_OP_LOAD_2Nx8_IP        IVP_LA2NX8_IP
+#define MORPH_OP_LOAD_2Nx8_VARIABLE  IVP_LAV2NX8_XP
+#define MORPH_OP_MULA                IVP_MULA2N8XR16
+#define MORPH_OP_MUL4TA              IVP_MUL4TA2N8XR8
+#define MORPH_OP_MULQA               IVP_MULQA2N8XR8
+#define MORPH_OP_MULPA               IVP_MULPA2N8XR16
+#endif
+
+/******************************************************************************************
+* MOW Stride 1 variants
+******************************************************************************************/
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_1x1j1, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 1x1 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 1x1 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 1               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 1x1xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_1x1j1_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_1x1j1_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_1x1j1, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, matching the callee's "d1" suffix - confirm */
+  /* MAKE_NAME() resolves the callee to xaiConvolved3D_S_1x1j1d1_U8S8IX_MOW_WHD or ..._S8S8IX_MOW_WHD, per INPUT_DATA_TYPE; its status is returned. */
+  return(MAKE_NAME(xaiConvolved3D_S_1x1j1d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  MAKE_NAME(xaiConvolve3D_S_3x3j1, S8IX_MOW_WHD)
+*  **************************************************************************/
+
+/******************** xaiConvolve3D_S_3x3j1_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_3x3j1_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_3x3j1, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, matching the callee's "d1" suffix - confirm */
+  /* MAKE_NAME() resolves the callee to xaiConvolved3D_S_3x3j1d1_U8S8IX_MOW_WHD or ..._S8S8IX_MOW_WHD, per INPUT_DATA_TYPE; its status is returned. */
+  return(MAKE_NAME(xaiConvolved3D_S_3x3j1d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_5x5j1, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 5x5 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 5x5 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 1               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 5x5xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_5x5j1_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_5x5j1_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE  MAKE_NAME(xaiConvolve3D_S_5x5j1, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, matching the callee's "d1" suffix - confirm */
+  /* MAKE_NAME() resolves the callee to xaiConvolved3D_S_5x5j1d1_U8S8IX_MOW_WHD or ..._S8S8IX_MOW_WHD, per INPUT_DATA_TYPE; its status is returned. */
+  return(MAKE_NAME(xaiConvolved3D_S_5x5j1d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_7x7j1, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 7x7 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 7x7 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 1               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 7x7xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_7x7j1_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_7x7j1_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_7x7j1, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, matching the callee's "d1" suffix - confirm */
+  /* MAKE_NAME() resolves the callee to xaiConvolved3D_S_7x7j1d1_U8S8IX_MOW_WHD or ..._S8S8IX_MOW_WHD, per INPUT_DATA_TYPE; its status is returned. */
+  return(MAKE_NAME(xaiConvolved3D_S_7x7j1d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_MxNj1, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for MxN 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate MxN 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 1               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is MxNxDxNk                                      */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_MxNj1_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_MxNj1_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_MxNj1, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, matching the callee's "d1" suffix - confirm */
+  /* MAKE_NAME() resolves the callee to xaiConvolved3D_S_MxNj1d1_U8S8IX_MOW_WHD or ..._S8S8IX_MOW_WHD, per INPUT_DATA_TYPE; its status is returned. */
+  return(MAKE_NAME(xaiConvolved3D_S_MxNj1d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+* MOW Stride 2 variants
+******************************************************************************************/
+
+
+/*****************************************************************************
+*  MAKE_NAME(xaiConvolve3D_S_1x1j2, S8IX_MOW_WHD)
+*  **************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 1x1 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 1x1 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 2               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 1x1xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_1x1j2_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_1x1j2_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_1x1j2, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: only param is checked in this wrapper; the tiles and bias array are passed through unchecked. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); /* NOTE(review): presumably stores dilation X = Y = 1 in *param, matching the callee's "d1" suffix - confirm */
+  /* MAKE_NAME() resolves the callee to xaiConvolved3D_S_1x1j2d1_U8S8IX_MOW_WHD or ..._S8S8IX_MOW_WHD, per INPUT_DATA_TYPE; its status is returned. */
+  return(MAKE_NAME(xaiConvolved3D_S_1x1j2d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* Not reached: the return above is unconditional. */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_3x3j2, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 3x3 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 3x3 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 2               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 3x3xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_3x3j2_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_3x3j2_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_3x3j2, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: this wrapper validates only the param pointer; the tiles are passed on as-is. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1);
+  /* Dilation (1, 1) is applied to the caller's param, which is then forwarded to the 3x3j2d1 variant. */
+  return(MAKE_NAME(xaiConvolved3D_S_3x3j2d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* NOTE(review): never reached, the statement above always returns; confirm the error macros need it. */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_5x5j2, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 5x5 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 5x5 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 2               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 5x5xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_5x5j2_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_5x5j2_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_5x5j2, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: this wrapper validates only the param pointer; the tiles are passed on as-is. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1);
+  /* Dilation (1, 1) is applied to the caller's param, which is then forwarded to the 5x5j2d1 variant. */
+  return(MAKE_NAME(xaiConvolved3D_S_5x5j2d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* NOTE(review): never reached, the statement above always returns; confirm the error macros need it. */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_7x7j2, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 7x7 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 7x7 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 2               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 7x7xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_7x7j2_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_7x7j2_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_7x7j2, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: this wrapper validates only the param pointer; the tiles are passed on as-is. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1);
+  /* Dilation (1, 1) is applied to the caller's param, which is then forwarded to the 7x7j2d1 variant. */
+  return(MAKE_NAME(xaiConvolved3D_S_7x7j2d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* NOTE(review): never reached, the statement above always returns; confirm the error macros need it. */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_MxNj2, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for MxN 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate MxN 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 2               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is MxNxDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_MxNj2_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_MxNj2_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_MxNj2, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: this wrapper validates only the param pointer; the tiles are passed on as-is. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1);
+  /* Dilation (1, 1) is applied to the caller's param, which is then forwarded to the MxNj2d1 variant. */
+  return(MAKE_NAME(xaiConvolved3D_S_MxNj2d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* NOTE(review): never reached, the statement above always returns; confirm the error macros need it. */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+* MOW Stride 4 variants
+******************************************************************************************/
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_1x1j4, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 1x1 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 1x1 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 4               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 1x1xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_1x1j4_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_1x1j4_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_1x1j4, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: this wrapper validates only the param pointer; the tiles are passed on as-is. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1);
+  /* Dilation (1, 1) is applied to the caller's param, which is then forwarded to the 1x1j4d1 variant. */
+  return(MAKE_NAME(xaiConvolved3D_S_1x1j4d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* NOTE(review): never reached, the statement above always returns; confirm the error macros need it. */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_3x3j4, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 3x3 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 3x3 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 4               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 3x3xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_3x3j4_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_3x3j4_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_3x3j4, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: this wrapper validates only the param pointer; the tiles are passed on as-is. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1);
+  /* Dilation (1, 1) is applied to the caller's param, which is then forwarded to the 3x3j4d1 variant. */
+  return(MAKE_NAME(xaiConvolved3D_S_3x3j4d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* NOTE(review): never reached, the statement above always returns; confirm the error macros need it. */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_5x5j4, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 5x5 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 5x5 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 4               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 5x5xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_5x5j4_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_5x5j4_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_5x5j4, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: this wrapper validates only the param pointer; the tiles are passed on as-is. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1);
+  /* Dilation (1, 1) is applied to the caller's param, which is then forwarded to the 5x5j4d1 variant. */
+  return(MAKE_NAME(xaiConvolved3D_S_5x5j4d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* NOTE(review): never reached, the statement above always returns; confirm the error macros need it. */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_7x7j4, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 7x7 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 7x7 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 4               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 7x7xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_7x7j4_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_7x7j4_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_7x7j4, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: this wrapper validates only the param pointer; the tiles are passed on as-is. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1);
+  /* Dilation (1, 1) is applied to the caller's param, which is then forwarded to the 7x7j4d1 variant. */
+  return(MAKE_NAME(xaiConvolved3D_S_7x7j4d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* NOTE(review): never reached, the statement above always returns; confirm the error macros need it. */
+  return(XAI_ERROR_STATUS());
+}
+
+
+/******************************************************************************************
+*   MAKE_NAME(xaiConvolve3D_S_MxNj4, S8IX_MOW_WHD)
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for MxN 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate MxN 3D convolution function for U8 bit and  */
+/*               S8 bit input data with input stride equal to 4               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               CNN convolution params structure                             */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is MxNxDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************** xaiConvolve3D_S_MxNj4_S8S8IX_MOW_WHD *********************/
+/******************** xaiConvolve3D_S_MxNj4_U8S8IX_MOW_WHD *********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_MxNj4, S8IX_MOW_WHD) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: this wrapper validates only the param pointer; the tiles are passed on as-is. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1);
+  /* Dilation (1, 1) is applied to the caller's param, which is then forwarded to the MxNj4d1 variant. */
+  return(MAKE_NAME(xaiConvolved3D_S_MxNj4d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param));
+  /* NOTE(review): never reached, the statement above always returns; confirm the error macros need it. */
+  return(XAI_ERROR_STATUS());
+}
+
+/********************************** end of MOW variants ************************************/
+/*******************************************************************************************/
+#endif //if ((XCHAL_VISION_TYPE >= 6))
+
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.c
new file mode 100644
index 00000000000..366ad6b8b2f
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.c
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+/* cnn_conv_SO.h is a template header: it is included once per input data type below. */
+#undef DILATED_SO_VQ_CONV
+/* First pass: INPUT_DATA_TYPE == UNSIGNED8BIT selects the U8 name tag and MORPH_* macro set. */
+#define INPUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_conv_SO.h"
+#undef INPUT_DATA_TYPE
+/* Second pass: INPUT_DATA_TYPE == SIGNED8BIT selects the S8 name tag and MORPH_* macro set. */
+#define INPUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_conv_SO.h"
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.h
new file mode 100644
index 00000000000..1cd413bba04
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#if ((XCHAL_VISION_TYPE >= 6))
+/* MAKE_NAME(n, s) pastes n, "_", the input-type tag (U8 or S8) and s into a single identifier. */
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT, suffix)  name ## _ ## MORPH_FNAME_SPECIFIER_IDT ## suffix
+/* Template header: the including file defines INPUT_DATA_TYPE before each inclusion. */
+#if INPUT_DATA_TYPE == UNSIGNED8BIT
+/* NOTE(review): this branch has no #undef list, so it presumably must be the first inclusion. */
+#define MAKE_NAME(name, suffix)  MAKE_NAME_IMPL(name, U8, suffix)
+#define MORPH_IDT_CHECK              XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR             uint8_t
+#define MORPH_IDT_2Nx8               xb_vec2Nx8U
+#define MORPH_OP_PRIME_2Nx8          IVP_LA2NX8U_PP
+#define MORPH_OP_ALIGN_LOAD_2Nx8     IVP_LV2NX8U_XP
+#define MORPH_OP_LOAD_2Nx8           IVP_LA2NX8U_XP
+#define MORPH_OP_LOAD_2Nx8_IP        IVP_LA2NX8U_IP
+#define MORPH_OP_LOAD_2Nx8_VARIABLE  IVP_LAV2NX8U_XP
+#define MORPH_OP_MULA                IVP_MULUSA2N8XR16
+#define MORPH_OP_MULPA               IVP_MULUSPA2NX8
+
+
+#elif INPUT_DATA_TYPE == SIGNED8BIT
+/* Discard the macro set left by an earlier inclusion of this header before redefining it for S8. */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_2Nx8
+#undef MORPH_OP_PRIME_2Nx8
+#undef MORPH_OP_ALIGN_LOAD_2Nx8
+#undef MORPH_OP_LOAD_2Nx8_IP
+#undef MORPH_OP_LOAD_2Nx8_VARIABLE
+#undef MORPH_OP_LOAD_2Nx8
+#undef MORPH_OP_MULA
+#undef MORPH_OP_MULPA
+
+/* Signed 8-bit input: same macro names as the U8 branch, mapped to the S8 tag, type and intrinsics. */
+#define MAKE_NAME(name, suffix)  MAKE_NAME_IMPL(name, S8, suffix)
+#define MORPH_IDT_CHECK              XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR             int8_t
+#define MORPH_IDT_2Nx8               xb_vec2Nx8
+#define MORPH_OP_PRIME_2Nx8          IVP_LA2NX8_PP
+#define MORPH_OP_ALIGN_LOAD_2Nx8     IVP_LV2NX8_XP
+#define MORPH_OP_LOAD_2Nx8           IVP_LA2NX8_XP
+#define MORPH_OP_LOAD_2Nx8_IP        IVP_LA2NX8_IP
+#define MORPH_OP_LOAD_2Nx8_VARIABLE  IVP_LAV2NX8_XP
+#define MORPH_OP_MULA                IVP_MULA2N8XR16
+#define MORPH_OP_MULPA               IVP_MULPA2NX8
+#endif
+
+/******************************************************************************************
+* SO(Single output) variants
+******************************************************************************************/
+
+/***************************************************************************/
+/* xaiConvolve3D_S_MxN_S8S8IX_SO_DWH/xaiConvolve3D_S_MxN_U8S8IX_SO_DWH */
+/***************************************************************************/
+
+/***********************************************************************/
+/* Description : P6 Optimized implementation of 3D convolution in SO   */
+/*               Vectorization Approach.                               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,         */
+/*               CNN convolution params structure                      */
+/* Outputs     : XI Error Code                                         */
+/* InOuts      : Output Tile                                           */
+/* Assumptions : InData is S8/U8                                       */
+/*               CoeffData is S8                                       */
+/*               OutData is S8 / U8 / S16                              */
+/*               Kernel Size is close to that of Input Size.           */
+/*               Input and Output is in DWH format.                    */
+/*               Coeff is in DWHN format.                              */
+/*               dim1Size of Input Tile is equal to dim1Pitch of Input */
+/*               Tile.                                                 */
+/***********************************************************************/
+
+/******************* xaiConvolve3D_S_MxN_S8S8IX_SO_DWH ********************/
+/******************* xaiConvolve3D_S_MxN_U8S8IX_SO_DWH ********************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_MxN, S8IX_SO_DWH) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  xai_cnn_conv_params * param
+  )
+{
+  /* Error checks: this wrapper validates only the param pointer; the tiles are passed on as-is. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(param);
+  }
+  /* Dilation (1, 1) is applied to the caller's param, which is then forwarded to xaiConvolved3D_S_MxN. */
+  XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1);
+  return(MAKE_NAME(xaiConvolved3D_S_MxN, S8IX_SO_DWH) (inTile, coeffTile, biasArray, outTile, param));
+  return(XAI_ERROR_STATUS()); /* NOTE(review): never reached; confirm the error macros need it. */
+}
+
+/****************************** end of SO variants *****************************************/
+/*******************************************************************************************/
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_VQ.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_VQ.c
new file mode 100644
index 00000000000..5b173b24002
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_VQ.c
@@ -0,0 +1,1371 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+/******************************************************************************
+ * 3D VQ dilated convolution, generic entry point. Returns XAI_ERR_NULLARG when inTile, coeffTile or param is NULL and
+ * XAI_ERR_NO_VARIANT when xaiGetConvolvedVQ3DVariant returns NULL; otherwise returns the selected variant's result.
+ *****************************************************************************/
+XAI_ERR_TYPE xaiConvolvedVQ3D(const xai_pTile3D inTile,
+                              const xai_pTile4D coeffTile,
+                              const xai_pArray biasArray,
+                              const xai_pArray outputScaleArray,
+                              xai_pTile3D outTile,
+                              const xai_cnn_conv_params *param)
+{
+  /* The arguments inTile, coeffTile and param are used by the xaiGetConvolvedVQ3DVariant
+   * helper function to derive the appropriate convolution variant, so they must be non-NULL */
+  if ((!inTile) || (!coeffTile) || (!param))
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  /* Function pointer type that the helper's XAI_ERR_TYPE * return value is cast to before the call */
+  typedef XAI_ERR_TYPE (*fConvdVQPtr)(const xai_pTile3D inTile,
+                                      const xai_pTile4D coeffTile,
+                                      const xai_pArray biasArray,
+                                      const xai_pArray outputScaleArray,
+                                      xai_pTile3D outTile,
+                                      const xai_cnn_conv_params* param);
+
+  /* Getting the function pointer of the convolution variant using the xaiGetConvolvedVQ3DVariant function */
+  fConvdVQPtr xaiConvolveVQ3D_opt =
+    (fConvdVQPtr) xaiGetConvolvedVQ3DVariant(inTile, coeffTile, biasArray, outputScaleArray, outTile, param);
+
+  if (xaiConvolveVQ3D_opt == NULL)
+  {
+    return(XAI_ERR_NO_VARIANT);
+  }
+  else
+  {
+    return(xaiConvolveVQ3D_opt(inTile, coeffTile, biasArray, outputScaleArray, outTile, param));
+  }
+}
+
+/*********************************************************************************************
+* 3D VQ dilated convolution helper: selects a variant from coeff/input data order, input type, kernel size, stride and dilation.
+* Returns the variant's address cast to XAI_ERR_TYPE *, or NULL when inTile, coeffTile or param is NULL or nothing matches.
+*********************************************************************************************/
+XAI_ERR_TYPE *xaiGetConvolvedVQ3DVariant(const xai_pTile3D inTile,
+                                         const xai_pTile4D coeffTile,
+                                         const xai_pArray biasArray,
+                                         const xai_pArray outputScaleArray,
+                                         xai_pTile3D outTile,
+                                         const xai_cnn_conv_params *param)
+{
+  if ((!inTile) || (!coeffTile) || (!param))
+  {
+    return(NULL);
+  }
+
+  uint8_t stride;   /* only assigned and read in the XAI_WHDN (MOW) branch */
+  uint8_t dilation; /* only assigned and read in the XAI_WHDN (MOW) branch */
+  xai_cnn_data_order coeffOrder = XAI_TILE4D_GET_DATA_ORDER(coeffTile);
+
+  int32_t kWidth, kHeight;
+  xai_cnn_data_order inOrder = XAI_TILE3D_GET_DATA_ORDER(inTile);
+
+  if (coeffOrder == XAI_NDWH)
+  {
+    /* MOD variants: kernel width/height are read from coeff dims 3 and 4 */
+    kWidth  = XAI_TILE4D_GET_DIM3(coeffTile);
+    kHeight = XAI_TILE4D_GET_DIM4(coeffTile);
+
+    if (inOrder == XAI_WHD) /* WHD input: chosen by kernel size only, no type or stride test in this branch */
+    {
+      if (kWidth == 1 && kHeight == 1)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 2 && kHeight == 2)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 3 && kHeight == 3)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 4 && kHeight == 4)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 5 && kHeight == 5)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else if (kWidth == 7 && kHeight == 7)
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH);
+      }
+      else
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH);
+      }
+    }
+    else if (inOrder == XAI_DWH)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) && XAI_TILE4D_CHECK_TYPE(coeffTile, XAI_S8))
+      {
+        if (XAI_CNN_CONV_GET_STRIDEX(param) != XAI_CNN_CONV_GET_STRIDEY(param)) /* differing X/Y strides -> generic MxN */
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH);
+        }
+        else if (XAI_CNN_CONV_GET_STRIDE(param) != 1 && XAI_CNN_CONV_GET_STRIDE(param) != 2 \
+                 && XAI_CNN_CONV_GET_STRIDE(param) != 4) /* stride other than 1, 2 or 4 -> generic MxN */
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 1 && kHeight == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 2 && kHeight == 2)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 3 && kHeight == 3)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 4 && kHeight == 4)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 5 && kHeight == 5)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_DWH);
+        }
+        else if (kWidth == 7 && kHeight == 7)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_DWH);
+        }
+        else
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH);
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8) && XAI_TILE4D_CHECK_TYPE(coeffTile, XAI_S8)) /* U8 input: only 1x1 and MxN are selected, no stride test */
+      {
+        if (kWidth == 1 && kHeight == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH);
+        }
+        else
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH);
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16) && XAI_TILE4D_CHECK_TYPE(coeffTile, XAI_S16)) /* S16 input and coeff: single MxN variant */
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH);
+      }
+    }
+  }
+  else if (coeffOrder == XAI_WHDN)
+  {
+    /* MOW variants: kernel width/height are read from coeff dims 1 and 2; combinations not listed below fall through to return NULL */
+    stride   = XAI_CNN_CONV_GET_STRIDE(param);
+    dilation = XAI_CNN_CONV_GET_DILATION(param);
+    kWidth   = XAI_TILE4D_GET_DIM1(coeffTile);
+    kHeight  = XAI_TILE4D_GET_DIM2(coeffTile);
+    if (kWidth == 1 && kHeight == 1)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1j1d1_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1j2d1_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1j4d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1j1d1_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1j2d1_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1j4d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+    }
+    else if (kWidth == 2 && kHeight == 2)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_2x2j1d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_2x2j1d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        if (stride == 2) /* plain 'if' rather than 'else if'; same outcome because the stride tests are mutually exclusive */
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+    }
+    else if (kWidth == 3 && kHeight == 3)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j1d1_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j1d2_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j1d4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j2d1_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j4d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j1d1_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j1d2_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j1d4_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j2d1_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j4d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+    }
+    else if (kWidth == 4 && kHeight == 4)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_4x4j1d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_4x4j1d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        if (stride == 2) /* plain 'if' rather than 'else if'; same outcome because the stride tests are mutually exclusive */
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+    }
+    else if (kWidth == 5 && kHeight == 5)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j1d1_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j1d2_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j1d4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j2d1_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j4d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j1d1_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j1d2_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j1d4_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j2d1_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j4d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+    }
+    else if (kWidth == 7 && kHeight == 7)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j1d1_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j1d2_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j1d4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j2d1_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j4d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j1d1_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j1d2_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j1d4_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j2d1_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j4d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+    }
+    else /* any other kernel size (including non-square): generic MxN MOW variants */
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d2_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d2_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d4_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_U8S8IX_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 2)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+          }
+        }
+        else if (stride == 4)
+        {
+          if (dilation == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+          }
+        }
+      }
+    }
+  }
+  else if (coeffOrder == XAI_DWHN)
+  {
+    /* SO variants: chosen by input tile type only */
+    if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+    {
+      return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S8S8IX_SO_DWH);
+    }
+    else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+    {
+      return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_U8S8IX_SO_DWH);
+    }
+  }
+/* NOTE(review): this line was a commented-out "#else"; the "#if 0" region below is compiled out and looks like a leftover reduced dispatch table -- confirm whether it can be deleted */
+#if 0
+  xai_cnn_data_order inOrder = XAI_TILE3D_GET_DATA_ORDER(inTile);
+
+  if (coeffOrder == XAI_DWHN)
+  {
+    /* SO variants */
+    if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+    {
+      return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S8S8IX_SO_DWH);
+    }
+    else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+    {
+      return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_U8S8IX_SO_DWH);
+    }
+  }
+  else if (coeffOrder == XAI_NDWH)
+  {
+    if (inOrder == XAI_DWH)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+      {
+        return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH);
+      }
+    }
+  }
+  else if (coeffOrder == XAI_WHDN)
+  {
+    /* MOW variants */
+    stride   = XAI_CNN_CONV_GET_STRIDE(param);
+    dilation = XAI_CNN_CONV_GET_DILATION(param);
+
+    if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+    {
+      if (stride == 1)
+      {
+        if (dilation == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD);
+        }
+      }
+      else if (stride == 2)
+      {
+        if (dilation == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD);
+        }
+      }
+      else if (stride == 4)
+      {
+        if (dilation == 1)
+        {
+          return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD);
+        }
+      }
+    }
+  }
+#endif
+
+  return(NULL); /* no variant matched the data order / type / kernel / stride / dilation combination */
+}
+
+/******************************************************************************
+ * Depthwise VQ convolution general version
+ * Calls a specific depthwise VQ convolution function based on parameters
+ * Thin dispatcher only: XAI_ERR_NULLARG for NULL inTile/coeffTile/param, XAI_ERR_NO_VARIANT if the helper returns NULL
+ *****************************************************************************/
+XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D(const xai_pTile3D inTile,
+                                      const xai_pTile3D coeffTile,
+                                      const xai_pArray biasArray,
+                                      const xai_pArray outputScaleArray,
+                                      xai_pTile3D outTile,
+                                      const xai_cnn_conv_params *param)
+{
+  /* The arguments inTile, coeffTile and param are used by the xaiGetDepthwiseConvolveVQ2DVariant
+   * helper function to derive the appropriate convolution variant, so they must be non-NULL */
+  if ((!inTile) || (!coeffTile) || (!param))
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  /* Function pointer type that the helper's XAI_ERR_TYPE * return value is cast to before the call */
+  typedef XAI_ERR_TYPE (*fDepthwiseConvVQPtr)(const xai_pTile3D inTile,
+                                              const xai_pTile3D coeffTile,
+                                              const xai_pArray biasArray,
+                                              const xai_pArray outputScaleArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_conv_params* param);
+
+  /* Getting the function pointer of the VQ depthwise convolution variant using the        */
+  /* xaiGetDepthwiseConvolveVQ2DVariant function (it also returns NULL for a NULL outTile) */
+  fDepthwiseConvVQPtr xaiDepthwiseConvolveVQ2D_opt = (fDepthwiseConvVQPtr) xaiGetDepthwiseConvolveVQ2DVariant(inTile,
+                                                                                                              coeffTile,
+                                                                                                              biasArray,
+                                                                                                              outputScaleArray,
+                                                                                                              outTile,
+                                                                                                              param);
+
+  if (xaiDepthwiseConvolveVQ2D_opt == NULL)
+  {
+    return(XAI_ERR_NO_VARIANT);
+  }
+  else
+  {
+    return(xaiDepthwiseConvolveVQ2D_opt(inTile, coeffTile, biasArray, outputScaleArray,
+                                        outTile, param));
+  }
+}
+
+/**************************************************************************************
+* Depthwise VQ convolution helper function
+* Returns the function pointer of a specific depthwise VQ convolution variant based on parameters
+**************************************************************************************/
+XAI_ERR_TYPE *xaiGetDepthwiseConvolveVQ2DVariant(const xai_pTile3D inTile,
+                                                 const xai_pTile3D coeffTile,
+                                                 const xai_pArray biasArray,
+                                                 const xai_pArray outputScaleArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_conv_params *param)
+{
+  if ((!inTile) || (!coeffTile) || (!outTile) || (!param))
+  {
+    return(NULL);
+  }
+  if (!(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile)))
+  {
+    return(NULL);
+  }
+
+  xai_cnn_data_order inOrder    = XAI_TILE3D_GET_DATA_ORDER(inTile);
+  xai_cnn_data_order coeffOrder = XAI_TILE3D_GET_DATA_ORDER(coeffTile);
+  int32_t kWidth, kHeight;
+
+  if (coeffOrder == XAI_DWH)
+  {
+    /* MOD variants */
+    kWidth  = XAI_TILE3D_GET_DIM2(coeffTile);
+    kHeight = XAI_TILE3D_GET_DIM3(coeffTile);
+    if (inOrder == XAI_DWH)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (XAI_CNN_CONV_GET_STRIDEX(param) != XAI_CNN_CONV_GET_STRIDEY(param))
+        {
+          if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_S8S8IXCa2_MOD_DWH);
+          }
+          else
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_U8S8IXCa2_MOD_DWH);
+          }
+        }
+        else if (XAI_CNN_CONV_GET_STRIDE(param) != 1 && XAI_CNN_CONV_GET_STRIDE(param) != 2 \
+                 && XAI_CNN_CONV_GET_STRIDE(param) != 4)
+        {
+          if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_S8S8IXCa2_MOD_DWH);
+          }
+          else
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_U8S8IXCa2_MOD_DWH);
+          }
+        }
+        else if (kWidth == 3 && kHeight == 3)
+        {
+          if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3_S8S8IXCa2_MOD_DWH);
+          }
+          else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3_U8S8IXCa2_MOD_DWH);
+          }
+        }
+        else if (kWidth == 5 && kHeight == 5)
+        {
+          if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5_S8S8IXCa2_MOD_DWH);
+          }
+          else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5_U8S8IXCa2_MOD_DWH);
+          }
+        }
+        else if (kWidth == 7 && kHeight == 7 && XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7_S8S8IXCa2_MOD_DWH);
+        }
+        else
+        {
+          if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_S8S8IXCa2_MOD_DWH);
+          }
+          else
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_U8S8IXCa2_MOD_DWH);
+          }
+        }
+      } /* if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) */
+      else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))
+      {
+        return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_S16S16I16_MOD_DWH);
+      } /* if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))*/
+    }   /* if(inOrder == XAI_DWH) */
+  }     /* if (coeffOrder == XAI_DWH) */
+  else if (coeffOrder == XAI_WHD)
+  {
+    /* MOW variants */
+    uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param);
+    kWidth  = XAI_TILE3D_GET_DIM1(coeffTile);
+    kHeight = XAI_TILE3D_GET_DIM2(coeffTile);
+    if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+    {
+      /* MOW variants */
+      if (kWidth == 3 && kHeight == 3)
+      {
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3j1_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3j2_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3j4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3j1_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3j2_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3j4_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (kWidth == 5 && kHeight == 5)
+      {
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5j1_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5j2_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5j4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5j1_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5j2_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5j4_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (kWidth == 7 && kHeight == 7)
+      {
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7j1_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7j2_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7j4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7j1_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7j2_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7j4_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else
+      {
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj1_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj2_S8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj4_S8S8IX_MOW_WHD);
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          if (stride == 1)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj1_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj2_U8S8IX_MOW_WHD);
+          }
+          else if (stride == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj4_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+/* #if XCHAL_VISION_QUAD_MAC_TYPE != 0 */
+    } /* if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) */
+    else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))
+    {
+      if (stride == 1)
+      {
+        return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj1_S16S16I16_MOW_WHD);
+      }
+      else if (stride == 2)
+      {
+        return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj2_S16S16I16_MOW_WHD);
+      }
+      else if (stride == 4)
+      {
+        return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj4_S16S16I16_MOW_WHD);
+      }
+    } /* if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) */
+  }   /*  if(coeffOrder == XAI_WHD) */
+  return(NULL);
+}
+
+/******************************************************************************
+ * Depthwise dilated VQ convolution - generic entry point.
+ * Looks up the matching kernel with xaiGetDepthwiseConvolvedVQ2DVariant() and
+ * forwards all six arguments to it unchanged. Only the NULL checks needed for
+ * the lookup are done here; the selected kernel is expected to do full checks.
+ *
+ * Returns: XAI_ERR_NULLARG if inTile, coeffTile or param is NULL,
+ *          XAI_ERR_NO_VARIANT if the lookup yields NULL,
+ *          otherwise the status returned by the selected kernel.
+ *****************************************************************************/
+XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D(const xai_pTile3D inTile,
+                                       const xai_pTile3D coeffTile,
+                                       const xai_pArray biasArray,
+                                       const xai_pArray outputScaleArray,
+                                       xai_pTile3D outTile,
+                                       const xai_cnn_depthwiseDilatedConv_params *param)
+{
+  /* Common signature of every depthwise dilated VQ convolution kernel */
+  typedef XAI_ERR_TYPE (*dilatedVQKernelFn)(const xai_pTile3D,
+                                            const xai_pTile3D,
+                                            const xai_pArray,
+                                            const xai_pArray,
+                                            xai_pTile3D,
+                                            const xai_cnn_depthwiseDilatedConv_params *);
+  dilatedVQKernelFn kernel;
+
+  /* The variant lookup reads inTile, coeffTile and param, so they must be valid */
+  if ((inTile == NULL) || (coeffTile == NULL) || (param == NULL))
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  /* Resolve the specialised kernel for this tile/parameter combination */
+  kernel = (dilatedVQKernelFn) xaiGetDepthwiseConvolvedVQ2DVariant(inTile, coeffTile,
+                                                                   biasArray, outputScaleArray,
+                                                                   outTile, param);
+
+  /* The lookup found no kernel for this combination */
+  if (kernel == NULL)
+  {
+    return(XAI_ERR_NO_VARIANT);
+  }
+
+  /* Hand over to the selected kernel and propagate its status */
+  return(kernel(inTile, coeffTile, biasArray, outputScaleArray, outTile, param));
+}
+
+/**************************************************************************************
+* Depthwise dilated VQ convolution helper function
+* Returns the address of the depthwise dilated VQ convolution variant selected from the
+* tile data orders, input data type, kernel size, stride and dilation; NULL if none fits
+**************************************************************************************/
+XAI_ERR_TYPE *xaiGetDepthwiseConvolvedVQ2DVariant(const xai_pTile3D inTile,
+                                                  const xai_pTile3D coeffTile,
+                                                  const xai_pArray biasArray,
+                                                  const xai_pArray outputScaleArray,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_depthwiseDilatedConv_params *param)
+{
+  if ((!inTile) || (!coeffTile) || (!param))
+  {
+    return(NULL);
+  }
+
+  xai_cnn_data_order inOrder    = XAI_TILE3D_GET_DATA_ORDER(inTile);
+  xai_cnn_data_order coeffOrder = XAI_TILE3D_GET_DATA_ORDER(coeffTile);
+#if (XCHAL_HAVE_SUPERGATHER == 0)
+  int32_t depthMultiplier = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(param); /* used only by the S8 MOD dispatch below */
+#endif
+  uint8_t stride;   /* assigned only on the WHD (MOW) path */
+  uint8_t dilation; /* assigned only on the WHD (MOW) path */
+
+  int32_t kWidth, kHeight;
+  if (coeffOrder == XAI_DWH)
+  {
+    /* MOD variants: kernel width/height are dims 2 and 3 of the DWH coefficient tile */
+    kWidth  = XAI_TILE3D_GET_DIM2(coeffTile);
+    kHeight = XAI_TILE3D_GET_DIM3(coeffTile);
+    if (inOrder == XAI_DWH)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) /* U8 input: only the generic MxN kernel is dispatched */
+      {
+        return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxN_U8S8IX_MOD_DWH);
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+#if (XCHAL_HAVE_SUPERGATHER == 0)
+        if (kWidth == 3 && kHeight == 3 && depthMultiplier != 8) /* depthMultiplier == 8 ends up in the MxN kernel */
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_3x3_S8S8IX_MOD_DWH);
+        }
+        else if (kWidth == 5 && kHeight == 5 && depthMultiplier != 8)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_5x5_S8S8IX_MOD_DWH);
+        }
+        else if (kWidth == 7 && kHeight == 7 && depthMultiplier != 8)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_7x7_S8S8IX_MOD_DWH);
+        }
+#else
+        if (kWidth == 3 && kHeight == 3)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_3x3_S8S8IX_MOD_DWH);
+        }
+        else if (kWidth == 5 && kHeight == 5)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_5x5_S8S8IX_MOD_DWH);
+        }
+        else if (kWidth == 7 && kHeight == 7)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_7x7_S8S8IX_MOD_DWH);
+        }
+#endif
+        else
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxN_S8S8IX_MOD_DWH);
+        }
+      }
+      else /* assumed to be S16 input; NOTE(review): the type is not actually checked on this path */
+      {
+        if (kWidth == 3 && kHeight == 3)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_3x3_S16S16I16_MOD_DWH);
+        }
+        else if (kWidth == 5 && kHeight == 5)
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_5x5_S16S16I16_MOD_DWH);
+        }
+        else
+        {
+          return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxN_S16S16I16_MOD_DWH);
+        }
+      }
+    }
+  }
+  /*else*/ if (coeffOrder == XAI_WHD)
+  {
+    /* MOW variants: only stride 1 combined with dilation 2 or 4 is dispatched */
+
+    stride   = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param);
+    dilation = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param);
+    /* Disabled: dedicated 3x3/5x5/7x7 dilated MOW dispatch; only the MxN kernels below are used */
+    /*if(kWidth == 3 && kHeight == 3)
+       {
+       if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+       {
+        if(stride == 1)
+        {
+          if(dilation == 2)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_3x3j1d2_S8S8IX_MOW_WHD);
+          }
+          else if(dilation == 4)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_3x3j1d4_S8S8IX_MOW_WHD);
+           }
+        }
+       }
+       else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+       {
+        if(stride == 1)
+        {
+          if(dilation == 2)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_3x3j1d2_U8S8IX_MOW_WHD);
+          }
+          else if(dilation == 4)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_3x3j1d4_U8S8IX_MOW_WHD);
+           }
+        }
+       }
+       }
+       else if(kWidth == 5 && kHeight == 5)
+       {
+       if (xaiTile3DCheckType(inTile, XAI_S8))
+       {
+        if(stride == 1)
+        {
+          if(dilation == 2)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_5x5j1d2_S8S8IX_MOW_WHD);
+          }
+          else if(dilation == 4)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_5x5j1d4_S8S8IX_MOW_WHD);
+          }
+        }
+       }
+       else if (xaiTile3DCheckType(inTile, XAI_U8))
+       {
+        if(stride == 1)
+        {
+          if(dilation == 2)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_5x5j1d2_U8S8IX_MOW_WHD);
+          }
+          else if(dilation == 4)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_5x5j1d4_U8S8IX_MOW_WHD);
+          }
+        }
+       }
+       }
+       else if(kWidth == 7 && kHeight == 7)
+       {
+       if (xaiTile3DCheckType(inTile, XAI_S8))
+       {
+        if(stride == 1)
+        {
+          if(dilation == 2)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_7x7j1d2_S8S8IX_MOW_WHD);
+          }
+          else if(dilation == 4)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_7x7j1d4_S8S8IX_MOW_WHD);
+           }
+        }
+       }
+       else if (xaiTile3DCheckType(inTile, XAI_U8))
+       {
+        if(stride == 1)
+        {
+          if(dilation == 2)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_7x7j1d2_U8S8IX_MOW_WHD);
+          }
+          else if(dilation == 4)
+          {
+            return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_7x7j1d4_U8S8IX_MOW_WHD);
+          }
+        }
+       }
+       }
+       else*/
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxNj1d2_S8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxNj1d4_S8S8IX_MOW_WHD);
+          }
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        if (stride == 1)
+        {
+          if (dilation == 2)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxNj1d2_U8S8IX_MOW_WHD);
+          }
+          else if (dilation == 4)
+          {
+            return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxNj1d4_U8S8IX_MOW_WHD);
+          }
+        }
+      }
+      /* any other input type, stride or dilation falls through to return(NULL) below */
+    }
+  }
+  return(NULL);
+}
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_AsymQ_S8IX.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_AsymQ_S8IX.h
new file mode 100644
index 00000000000..22a248658b2
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_AsymQ_S8IX.h
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2024 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#undef MAKE_NAME_IMPL
+#undef MAKE_NAME
+#undef MORPH_ODT_CHECK_TILE3D
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_OP_SA_IP
+#undef MORPH_OP_SAV_XP
+#undef MORPH_OP_SAPOS_FP
+#undef MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ
+
+#if OUTPUT_DATA_TYPE == SIGNED8BIT
+/* S8 output, name suffix _S8S8. NOTE(review): pack only, no explicit clamp to the int8 range (unlike U8) - confirm */
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER)  name ## _ ## MORPH_FNAME_SPECIFIER
+#define MAKE_NAME(name)                              MAKE_NAME_IMPL(name, S8S8)
+#define MORPH_ODT_CHECK_TILE3D  XAI_CHECK_TILE3D_S8
+#define MORPH_ODT_SCALAR        int8_t
+#define MORPH_ODT_VECTOR        xb_vecNx8
+#define MORPH_OP_SA_IP          IVP_SANX8S_IP
+#define MORPH_OP_SAV_XP         IVP_SAVNX8S_XP
+#define MORPH_OP_SAPOS_FP       IVP_SAPOSNX8S_FP
+
+#define MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut, vecAcc, shift)  { \
+    vecOut = IVP_PACKVRNX48(vecAcc, shift);                           \
+}
+
+#elif OUTPUT_DATA_TYPE == UNSIGNED8BIT
+/* U8 output, name suffix _S8U8: the packed value is additionally clamped to [0, UCHAR_MAX] */
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER)  name ## _ ## MORPH_FNAME_SPECIFIER
+#define MAKE_NAME(name)                              MAKE_NAME_IMPL(name, S8U8)
+#define MORPH_ODT_CHECK_TILE3D  XAI_CHECK_TILE3D_U8
+#define MORPH_ODT_SCALAR        uint8_t
+#define MORPH_ODT_VECTOR        xb_vecNx8U
+#define MORPH_OP_SA_IP          IVP_SANX8U_IP
+#define MORPH_OP_SAV_XP         IVP_SAVNX8U_XP
+#define MORPH_OP_SAPOS_FP       IVP_SAPOSNX8U_FP
+
+#define MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut, vecAcc, shift)  {                  \
+    vecOut = IVP_PACKVRNX48(vecAcc, shift);                                            \
+    vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) UCHAR_MAX), (xb_vecNx16) 0); \
+}
+
+#elif OUTPUT_DATA_TYPE == SIGNED16BIT
+/* S16 output, name suffix _S8S16: the result of IVP_PACKVRNX48 is used as is */
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER)  name ## _ ## MORPH_FNAME_SPECIFIER
+#define MAKE_NAME(name)                              MAKE_NAME_IMPL(name, S8S16)
+#define MORPH_ODT_CHECK_TILE3D  XAI_CHECK_TILE3D_S16
+#define MORPH_ODT_SCALAR        int16_t
+#define MORPH_ODT_VECTOR        xb_vecNx16
+#define MORPH_OP_SA_IP          IVP_SANX16_IP
+#define MORPH_OP_SAV_XP         IVP_SAVNX16_XP
+#define MORPH_OP_SAPOS_FP       IVP_SAPOSNX16_FP
+
+#define MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut, vecAcc, shift)  { \
+    vecOut = IVP_PACKVRNX48(vecAcc, shift);                           \
+}
+
+#elif OUTPUT_DATA_TYPE == UNSIGNED16BIT
+/* U16 output, name suffix _S8U16: packs into two 32-bit halves, clamps each to [0, USHRT_MAX], then re-interleaves */
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER)  name ## _ ## MORPH_FNAME_SPECIFIER
+#define MAKE_NAME(name)                              MAKE_NAME_IMPL(name, S8U16)
+#define MORPH_ODT_CHECK_TILE3D  XAI_CHECK_TILE3D_U16
+#define MORPH_ODT_SCALAR        uint16_t
+#define MORPH_ODT_VECTOR        xb_vecNx16U
+#define MORPH_OP_SA_IP          IVP_SANX16U_IP
+#define MORPH_OP_SAV_XP         IVP_SAVNX16U_XP
+#define MORPH_OP_SAPOS_FP       IVP_SAPOSNX16U_FP
+
+#define MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut, vecAcc, shift)  {                                      \
+    xb_vecN_2x32v hvecAccEven = IVP_PACKVRNX48_0(vecAcc, shift);                                           \
+    xb_vecN_2x32v hvecAccOdd  = IVP_PACKVRNX48_1(vecAcc, shift);                                           \
+    hvecAccEven = IVP_MAXN_2X32(IVP_MINN_2X32(hvecAccEven, (xb_vecN_2x32v) USHRT_MAX), (xb_vecN_2x32v) 0); \
+    hvecAccOdd  = IVP_MAXN_2X32(IVP_MINN_2X32(hvecAccOdd, (xb_vecN_2x32v) USHRT_MAX), (xb_vecN_2x32v) 0);  \
+    xb_vecNx16U vecAccEven = IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(hvecAccEven));                   \
+    xb_vecNx16U vecAccOdd  = IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(hvecAccOdd));                    \
+    vecOut = IVP_SELNX16UI(vecAccOdd, vecAccEven, IVP_SELI_16B_INTERLEAVE_1_EVEN);                         \
+}
+#endif
+
+/*********************** xaiDataConversion3D_AsymQ_S8IX ************************/
+/* Description : P6 implementation for conversion from either of the following */
+/*               1) S8_SYM to S8_ASYM                                          */
+/*               2) S8_ASYM to S8_SYM                                          */
+/*               3) S8_ASYM to S8_ASYM                                         */
+/*               4) S8_ASYM to U8_SYM                                          */
+/*               5) S8_ASYM to S16_SYM                                         */
+/*               6) S8_ASYM to U16_SYM                                         */
+/* Inputs      : Input Tile, fixUp, scale, shift                               */
+/* Outputs     : XI Error Code                                                 */
+/* InOuts      : Output Tile                                                   */
+/* Assumptions : InData is signed 8bit                                         */
+/*******************************************************************************/
+/* Per element: acc = (fixUp << shift) + scale * in; out = MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(acc, shift) */
+/********************* xaiDataConversion3D_AsymQ_S8S8  *************************/
+/********************* xaiDataConversion3D_AsymQ_S8U8  *************************/
+/********************* xaiDataConversion3D_AsymQ_S8S16 *************************/
+/********************* xaiDataConversion3D_AsymQ_S8U16 *************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiDataConversion3D_AsymQ)(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const int16_t fixUp,
+                                                   const uint16_t scale,
+                                                   const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    MORPH_ODT_CHECK_TILE3D(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR((fixUp >= SHRT_MIN) && (fixUp <= SHRT_MAX), XAI_ERR_NORM, \
+                    "\nfixUp = %hi, value must be greater than or equal to -32768 and less than 32768", fixUp);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  valign vaOut = IVP_ZALIGN();
+
+  /* Get Data Pointers */
+  int8_t *pInput            = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  MORPH_ODT_SCALAR *pOutput = (MORPH_ODT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* Vectorization width */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  /* Loop variables */
+  int32_t x, y, z;
+
+  /* Input and Output pointers */
+  xb_vecNx8 *restrict pvecIn;
+  MORPH_ODT_VECTOR *restrict pvecOut;
+
+  /* Input and Output data vectors */
+  xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+  xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3;
+
+  /* Accumulators */
+  xb_vecNx48 vecAcc1, vecAcc2, vecAcc3, vecAcc4;
+
+  xb_vecNx16U vecScale = (xb_vecNx16U) (scale);
+
+  // Assumes (fixUp << shift) fits in S32; NOTE(review): the error checks above do not enforce this
+  int32_t fixUpShift       = (fixUp << shift);
+  xb_vecNx48 vecFixUpShift = fixUpShift;
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When both the input and the output dim1 pitch equal the tile width      */
+  /*    (dim1Size)                                                              */
+  /*    - The elements to convert are contiguous in memory, so a single         */
+  /*      vectorized loop can run over whole planes, or over the whole tile     */
+  /*      when the dim2 pitches are tight as well.                              */
+  /*                                                                            */
+  /* 2. When either dim1 pitch differs from the tile width                      */
+  /*    - The elements to convert are not contiguous in memory.                 */
+  /*      Vectorization is done along the first dimension only: for every       */
+  /*      row the input and output vector pointers are recomputed from the      */
+  /*      respective tile pitches, and the row is processed in chunks of up     */
+  /*      to 4 vectors; narrower 3-, 2- and 1-vector paths handle the           */
+  /*      remainder of the row.                                                 */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* Input and Output vectors */
+    xb_vecNx16 vecInData;
+    xb_vecNx16 vecOut;
+
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* Initialize input and output data pointer */
+      pvecIn  = (xb_vecNx8 *) (pInput + (z * inTilePitch2));
+      pvecOut = (MORPH_ODT_VECTOR *) (pOutput + (z * outTilePitch2));
+      valign vaInData = IVP_LANX8S_PP(pvecIn);
+      int32_t varlen;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* Load input data */
+        IVP_LANX8S_IP(vecInData, vaInData, pvecIn);
+
+        // Initializing the 48-bit accumulator with the 32-bit "fixUpShift" value
+        vecAcc1 = vecFixUpShift;
+        IVP_MULUSANX16(vecAcc1, vecScale, vecInData);
+        // Packing the outcome to appropriate range
+        MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut, vecAcc1, shift);
+
+        /* Store output data */
+        MORPH_OP_SA_IP(vecOut, vaOut, pvecOut);
+      }
+
+      varlen = (maxLoopCount - x); /* elements left for the last vector: 1..vectorizationWidth when maxLoopCount > 0 */
+      IVP_LANX8S_IP(vecInData, vaInData, pvecIn);
+
+      // Initializing the 48-bit accumulator with the 32-bit "fixUpShift" value
+      vecAcc1 = vecFixUpShift;
+      IVP_MULUSANX16(vecAcc1, vecScale, vecInData);
+      // Packing the outcome to appropriate range
+      MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut, vecAcc1, shift);
+
+      /* Store output data */
+      MORPH_OP_SAV_XP(vecOut, vaOut, pvecOut, (varlen * bytesPerPixel));
+      MORPH_OP_SAPOS_FP(vaOut, pvecOut); /* flush what is still held in the store alignment register vaOut */
+    }
+  }
+  else
+  {
+    /* Reached when the input or the output dim1 pitch differs from dim1Size, so consecutive     */
+    /* rows are not contiguous in memory and each row is addressed through the tile pitches      */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn           = &pInput[(z * inTilePitch2) + x];
+        MORPH_ODT_SCALAR *pOut = &pOutput[(z * outTilePitch2) + x];
+        int32_t varLen         = dim1Size - (x + vectorizationWidth3X); /* elements left for the 4th vector */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          // Adjusting the input and output data pointers
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (MORPH_ODT_VECTOR *) (pOut + (y * outTilePitch1));
+
+          /* Load Input data */
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData3, vaInData, pvecIn);
+
+          // Initializing the 48-bit accumulators with the 32-bit "fixUpShift" value
+          vecAcc1 = vecFixUpShift;
+          vecAcc2 = vecFixUpShift;
+          vecAcc3 = vecFixUpShift;
+          vecAcc4 = vecFixUpShift;
+
+          IVP_MULUSANX16(vecAcc1, vecScale, vecInData0);
+          IVP_MULUSANX16(vecAcc2, vecScale, vecInData1);
+          IVP_MULUSANX16(vecAcc3, vecScale, vecInData2);
+          IVP_MULUSANX16(vecAcc4, vecScale, vecInData3);
+
+          // Packing the outcome to appropriate range
+          MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut0, vecAcc1, shift);
+          MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut1, vecAcc2, shift);
+          MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut2, vecAcc3, shift);
+          MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut3, vecAcc4, shift);
+
+          /* Store output data */
+          MORPH_OP_SA_IP(vecOut0, vaOut, pvecOut);
+          MORPH_OP_SA_IP(vecOut1, vaOut, pvecOut);
+          MORPH_OP_SA_IP(vecOut2, vaOut, pvecOut);
+          MORPH_OP_SAV_XP(vecOut3, vaOut, pvecOut, (varLen * bytesPerPixel));
+          MORPH_OP_SAPOS_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn           = &pInput[(z * inTilePitch2) + x];
+        MORPH_ODT_SCALAR *pOut = &pOutput[(z * outTilePitch2) + x];
+        int32_t varLen         = dim1Size - (x + vectorizationWidth2X); /* elements left for the 3rd vector */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          // Adjusting the input and output data pointers
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (MORPH_ODT_VECTOR *) (pOut + (y * outTilePitch1));
+
+          /* Load input data */
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData2, vaInData, pvecIn);
+
+          // Initializing the 48-bit accumulators with the 32-bit "fixUpShift" value
+          vecAcc1 = vecFixUpShift;
+          vecAcc2 = vecFixUpShift;
+          vecAcc3 = vecFixUpShift;
+
+          IVP_MULUSANX16(vecAcc1, vecScale, vecInData0);
+          IVP_MULUSANX16(vecAcc2, vecScale, vecInData1);
+          IVP_MULUSANX16(vecAcc3, vecScale, vecInData2);
+
+          // Packing the outcome to appropriate range
+          MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut0, vecAcc1, shift);
+          MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut1, vecAcc2, shift);
+          MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut2, vecAcc3, shift);
+
+          /* Store output data */
+          MORPH_OP_SA_IP(vecOut0, vaOut, pvecOut);
+          MORPH_OP_SA_IP(vecOut1, vaOut, pvecOut);
+          MORPH_OP_SAV_XP(vecOut2, vaOut, pvecOut, (varLen * bytesPerPixel));
+          MORPH_OP_SAPOS_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn           = &pInput[(z * inTilePitch2) + x];
+        MORPH_ODT_SCALAR *pOut = &pOutput[(z * outTilePitch2) + x];
+        int32_t varLen         = dim1Size - (x + vectorizationWidth); /* elements left for the 2nd vector */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          // Adjusting the input and output data pointers
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (MORPH_ODT_VECTOR *) (pOut + (y * outTilePitch1));
+
+          /* Load input data */
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+
+          // Initializing the 48-bit accumulators with the 32-bit "fixUpShift" value
+          vecAcc1 = vecFixUpShift;
+          vecAcc2 = vecFixUpShift;
+
+          IVP_MULUSANX16(vecAcc1, vecScale, vecInData0);
+          IVP_MULUSANX16(vecAcc2, vecScale, vecInData1);
+
+          // Packing the outcome to appropriate range
+          MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut0, vecAcc1, shift);
+          MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut1, vecAcc2, shift);
+
+          /* Store output data */
+          MORPH_OP_SA_IP(vecOut0, vaOut, pvecOut);
+          MORPH_OP_SAV_XP(vecOut1, vaOut, pvecOut, (varLen * bytesPerPixel));
+          MORPH_OP_SAPOS_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn           = &pInput[(z * inTilePitch2) + x];
+        MORPH_ODT_SCALAR *pOut = &pOutput[(z * outTilePitch2) + x];
+        int32_t varLen         = (dim1Size - x);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          // Adjusting the input and output data pointers
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (MORPH_ODT_VECTOR *) (pOut + (y * outTilePitch1));
+
+          /* Load input data */
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+
+          // Initializing the 48-bit accumulator with the 32-bit "fixUpShift" value
+          vecAcc1 = vecFixUpShift;
+          IVP_MULUSANX16(vecAcc1, vecScale, vecInData0);
+
+          // Packing the outcome to appropriate range
+          MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut0, vecAcc1, shift);
+
+          /* Store output data */
+          MORPH_OP_SAV_XP(vecOut0, vaOut, pvecOut, (varLen * bytesPerPixel));
+          MORPH_OP_SAPOS_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I16I8.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I16I8.h
new file mode 100644
index 00000000000..37fb11f0ca8
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I16I8.h
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#undef MAKE_NAME_IMPL /* clear any earlier definitions; all eight names are redefined below */
+#undef MAKE_NAME
+#undef MORPH_IDT_TILECHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_OP_PRIME
+#undef MORPH_OP_LOAD_IP
+#undef MORPH_OP_MUL
+
+#if INPUT_DATA_TYPE == SIGNED16BIT /* signed 16-bit input variant */
+
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER)  name ## MORPH_FNAME_SPECIFIER /* plain paste: caller supplies the trailing '_' */
+#define MAKE_NAME(name)                              MAKE_NAME_IMPL(name, S16I8) /* xaiDataConversion3D_ -> xaiDataConversion3D_S16I8 */
+#define MORPH_IDT_TILECHECK  XAI_CHECK_TILE3D_S16 /* validation applied to the input tile */
+#define MORPH_IDT_SCALAR     int16_t /* input element type */
+#define MORPH_IDT_VECTOR     xb_vecNx16 /* input vector type */
+#define MORPH_OP_PRIME       IVP_LANX16_PP /* produces the valign consumed by the loads */
+#define MORPH_OP_LOAD_IP     IVP_LANX16_IP /* vector load through the input pointer */
+#define MORPH_OP_MUL         IVP_MULUSNX16 /* called as MUL(unsigned scale, signed input) */
+
+#elif INPUT_DATA_TYPE == UNSIGNED16BIT /* unsigned 16-bit input variant */
+
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER)  name ## MORPH_FNAME_SPECIFIER /* plain paste: caller supplies the trailing '_' */
+#define MAKE_NAME(name)                              MAKE_NAME_IMPL(name, U16I8) /* xaiDataConversion3D_ -> xaiDataConversion3D_U16I8 */
+#define MORPH_IDT_TILECHECK  XAI_CHECK_TILE3D_U16 /* validation applied to the input tile */
+#define MORPH_IDT_SCALAR     uint16_t /* input element type */
+#define MORPH_IDT_VECTOR     xb_vecNx16U /* input vector type */
+#define MORPH_OP_PRIME       IVP_LANX16U_PP /* produces the valign consumed by the loads */
+#define MORPH_OP_LOAD_IP     IVP_LANX16U_IP /* vector load through the input pointer */
+#define MORPH_OP_MUL         IVP_MULUUNX16 /* called as MUL(unsigned scale, unsigned input) */
+#endif /* INPUT_DATA_TYPE; NOTE(review): no #else, so any other value leaves MAKE_NAME undefined */
+
+/********************* xaiDataConversion3D_S16/U16I8 ***************************/
+/* Description : P6 implementation for conversion from S16/U16 to S8 / U8     */
+/*               depending on Output Tile type                                */
+/* Inputs      : Input Tile, scale (multiplier), shift (must be < 32)         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : InData is signed/unsigned 16bit; tiles do not overlap        */
+/******************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiDataConversion3D_)(const xai_pTile3D inTile,
+                                              xai_pTile3D outTile,
+                                              const uint16_t scale,
+                                              const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_TILECHECK(inTile);
+    XAI_CHECK_TILE3D_I8(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \
+                    "Shift value = %hhu, which should be less than 32", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters (sizes are taken from the input tile; pitches from both tiles) */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  const int16_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) ? SCHAR_MIN : 0; /* lower clamp bound for the output type */
+  const int16_t maxLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) ? SCHAR_MAX : UCHAR_MAX; /* upper clamp bound for the output type */
+
+  /* Get Data Pointers */
+  MORPH_IDT_SCALAR *pInput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutput          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  valign vaOut             = IVP_ZALIGN(); /* alignment state shared by every output store below */
+
+  /* vectorization width (elements consumed per vector load) and its multiples used for unrolling */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  MORPH_IDT_VECTOR * restrict pvecIn;
+  xb_vecNx8U * restrict pvecOut;
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If above condition holds good, data elements for which data           */
+  /*      conversion from 16 bit to S8/U8 bit needs to be done are present in   */
+  /*      contiguous memory locations. Hence vectorization can be utilized      */
+  /*      effectively                                                           */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile size or input tile     */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - In this scenario, data elements for which data conversion from 16 bit */
+  /*      to S8/U8 bit needs to be done exist in non-contiguous memory          */
+  /*      locations. In order to do vectorization across first dimension,       */
+  /*      input and output data pointers are recomputed for every row from the  */
+  /*      respective tile pitch                                                 */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input data vectors */
+    MORPH_IDT_VECTOR vecInData;
+    /* Initialize max loop counter: one dim1 x dim2 plane per z iteration */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Update loop counts based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Whole tile is one contiguous run: process it in a single z pass */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (MORPH_IDT_VECTOR *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx8U *) (pOutput + (z * outTilePitch2));
+
+      valign vaInData = MORPH_OP_PRIME(pvecIn);
+      xb_vecNx16 vecOut;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) /* leaves the final 1..vectorizationWidth elements for the tail below */
+      {
+        /* load input data */
+        MORPH_OP_LOAD_IP(vecInData, vaInData, pvecIn);
+
+        /* apply scale and shift to input data.
+         * multiplying with scale results in 32 way 48-bit
+         * data to which shift is applied, so final result is
+         * 32 way 16 bit.
+         */
+        vecOut = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData), shift);
+
+        vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); /* clamp to [minLim, maxLim] */
+
+        /* store output data */
+        IVP_SANX8U_IP(vecOut, vaOut, pvecOut);
+      }
+      /* tail: load input data for the remaining (maxLoopCount - x) elements */
+      MORPH_OP_LOAD_IP(vecInData, vaInData, pvecIn);
+
+      /* apply scale and shift to input data.
+       * multiplying with scale results in 32 way 48-bit
+       * data to which shift is applied, so final result is
+       * 32 way 16 bit.
+       */
+      vecOut = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData), shift);
+
+      vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+      /* store only the remaining (maxLoopCount - x) outputs; each output is 1 byte */
+      IVP_SAVNX8U_XP(vecOut, vaOut, pvecOut, (maxLoopCount - x));
+      IVP_SAPOSNX8U_FP(vaOut, pvecOut); /* presumably flushes data still pending in vaOut - confirm against the ISA reference */
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch                                                   */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension: runs while more than 3 vectors of a row remain */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        MORPH_IDT_SCALAR * pIn = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut           = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen         = dim1Size - (x + vectorizationWidth3X); /* row elements left for the 4th vector */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          MORPH_IDT_VECTOR vecInData0, vecInData1, vecInData2, vecInData3;
+          xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3;
+
+          pvecIn  = (MORPH_IDT_VECTOR *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+          valign vaInData = MORPH_OP_PRIME(pvecIn);
+          /* load input data */
+          MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn);
+          MORPH_OP_LOAD_IP(vecInData1, vaInData, pvecIn);
+          MORPH_OP_LOAD_IP(vecInData2, vaInData, pvecIn);
+          MORPH_OP_LOAD_IP(vecInData3, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          vecOut1 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData1), shift);
+          vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          vecOut2 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData2), shift);
+          vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          vecOut3 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData3), shift);
+          vecOut3 = IVP_MAXNX16(IVP_MINNX16(vecOut3, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data: three full vectors, then varLen outputs of the fourth */
+          IVP_SANX8U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX8U_IP(vecOut1, vaOut, pvecOut);
+          IVP_SANX8U_IP(vecOut2, vaOut, pvecOut);
+          IVP_SAVNX8U_XP(vecOut3, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X)) /* more than 2 vectors of the row remain */
+      {
+        /* Initialize input and output data pointer */
+        MORPH_IDT_SCALAR * pIn = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut           = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen         = dim1Size - (x + vectorizationWidth2X); /* row elements left for the 3rd vector */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          MORPH_IDT_VECTOR vecInData0, vecInData1, vecInData2;
+          xb_vecNx16 vecOut0, vecOut1, vecOut2;
+
+          pvecIn  = (MORPH_IDT_VECTOR *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+          valign vaInData = MORPH_OP_PRIME(pvecIn);
+
+          /* load input data */
+          MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn);
+          MORPH_OP_LOAD_IP(vecInData1, vaInData, pvecIn);
+          MORPH_OP_LOAD_IP(vecInData2, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          vecOut1 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData1), shift);
+          vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          vecOut2 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData2), shift);
+          vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data: two full vectors, then varLen outputs of the third */
+          IVP_SANX8U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX8U_IP(vecOut1, vaOut, pvecOut);
+          IVP_SAVNX8U_XP(vecOut2, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth)) /* more than 1 vector of the row remains */
+      {
+        /* Initialize input and output data pointer */
+        MORPH_IDT_SCALAR * pIn = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut           = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen         = dim1Size - (x + vectorizationWidth); /* row elements left for the 2nd vector */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          MORPH_IDT_VECTOR vecInData0, vecInData1;
+          xb_vecNx16 vecOut0, vecOut1;
+
+          pvecIn  = (MORPH_IDT_VECTOR *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+          valign vaInData = MORPH_OP_PRIME(pvecIn);
+
+          /* load input data */
+          MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn);
+          MORPH_OP_LOAD_IP(vecInData1, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          vecOut1 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData1), shift);
+          vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data: one full vector, then varLen outputs of the second */
+          IVP_SANX8U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SAVNX8U_XP(vecOut1, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size) /* 1..vectorizationWidth elements of the row remain */
+      {
+        /* Initialize input and output data pointer */
+        MORPH_IDT_SCALAR * pIn = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut           = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen         = dim1Size - x; /* row elements left for the single vector */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          MORPH_IDT_VECTOR vecInData0;
+          xb_vecNx16 vecOut0;
+
+          pvecIn  = (MORPH_IDT_VECTOR *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+          valign vaInData = MORPH_OP_PRIME(pvecIn);
+
+          /* load input data */
+          MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data: varLen outputs only */
+          IVP_SAVNX8U_XP(vecOut0, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I8I32.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I8I32.h
new file mode 100644
index 00000000000..bddbd3058ca
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I8I32.h
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#undef MAKE_NAME_IMPL /* clear any earlier definitions; all nine names are redefined below */
+#undef MAKE_NAME
+#undef MORPH_IDT_TILECHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_IDT_VECTORI8
+#undef MORPH_OP_PRIME
+#undef MORPH_OP_LOAD_IP
+#undef MORPH_OP_MUL
+
+#if INPUT_DATA_TYPE == SIGNED8BIT /* signed 8-bit input variant */
+
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER)  name ## _ ## MORPH_FNAME_SPECIFIER /* inserts the '_' between name and suffix */
+#define MAKE_NAME(name)                              MAKE_NAME_IMPL(name, S8I32) /* xaiDataConversion3D -> xaiDataConversion3D_S8I32 */
+#define MORPH_IDT_TILECHECK  XAI_CHECK_TILE3D_S8 /* validation applied to the input tile */
+#define MORPH_IDT_SCALAR     int8_t /* input element type */
+#define MORPH_IDT_VECTOR     xb_vecNx16 /* vector type the loaded input is held in */
+#define MORPH_IDT_VECTORI8   xb_vecNx8 /* pointer type used to address the 8-bit input */
+#define MORPH_OP_PRIME       IVP_LANX8S_PP /* produces the valign consumed by the loads */
+#define MORPH_OP_LOAD_IP     IVP_LANX8S_IP /* vector load through the input pointer */
+#define MORPH_OP_MUL         IVP_MULUSNX16 /* called as MUL(unsigned scale, signed input) */
+
+#elif INPUT_DATA_TYPE == UNSIGNED8BIT /* unsigned 8-bit input variant */
+
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER)  name ## _ ## MORPH_FNAME_SPECIFIER /* inserts the '_' between name and suffix */
+#define MAKE_NAME(name)                              MAKE_NAME_IMPL(name, U8I32) /* xaiDataConversion3D -> xaiDataConversion3D_U8I32 */
+#define MORPH_IDT_TILECHECK  XAI_CHECK_TILE3D_U8 /* validation applied to the input tile */
+#define MORPH_IDT_SCALAR     uint8_t /* input element type */
+#define MORPH_IDT_VECTOR     xb_vecNx16U /* vector type the loaded input is held in */
+#define MORPH_IDT_VECTORI8   xb_vecNx8U /* pointer type used to address the 8-bit input */
+#define MORPH_OP_PRIME       IVP_LANX8U_PP /* produces the valign consumed by the loads */
+#define MORPH_OP_LOAD_IP     IVP_LANX8U_IP /* vector load through the input pointer */
+#define MORPH_OP_MUL         IVP_MULUUNX16 /* called as MUL(unsigned scale, unsigned input) */
+#endif /* INPUT_DATA_TYPE; NOTE(review): no #else, so any other value leaves MAKE_NAME undefined */
+
+
+/********************* xaiDataConversion3D_I8I32 ************************/
+/* Description : P6 implementation for conversion from S8/U8 to I32    */
+/* Inputs      : Input Tile, scale, shift                              */
+/* Outputs     : XI Error Code                                         */
+/* InOuts      : Output Tile                                           */
+/* Assumptions : InData is signed/unsigned 8bit                        */
+/***********************************************************************/
+/********************* xaiDataConversion3D_S8I32 ************************/
+/********************* xaiDataConversion3D_U8I32 ************************/
+XAI_ERR_TYPE MAKE_NAME (xaiDataConversion3D)(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const uint16_t scale,
+                                             const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_TILECHECK(inTile);
+    XAI_CHECK_TILE3D_I32(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  valign vaOut = IVP_ZALIGN();
+
+  /* Get Data Pointers */
+  MORPH_IDT_SCALAR *pInput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int32_t *pOutput         = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* vectorization width */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  const int32_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) ? INT_MIN : 0;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+
+  /* input and output pointers */
+  MORPH_IDT_VECTORI8 *restrict pvecIn;
+  xb_vecN_2x32v *restrict pvecOut;
+
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If above condition holds good, data elements for which data           */
+  /*      conversion from 8 bit to 32 bit needs to be done are in contiguous    */
+  /*      memory location. Hence vectorization can be utilized effectively      */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile size or input tile     */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - In this scenario, data elements for which data conversion from 8 bit  */
+  /*      to 32 bit needs to be done exist in non-contiguous memory location.   */
+  /*      In order to do vectorization across first dimension, output data      */
+  /*      pointers need to be updated based on output tile size and output tile */
+  /*      pitch.                                                                */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input and output vectors */
+    MORPH_IDT_VECTOR vecInData;
+
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    xb_vecN_2x32v vecOutL, vecOutH;
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (MORPH_IDT_VECTORI8 *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecN_2x32v *) (pOutput + (z * outTilePitch2));
+      valign vaInData = MORPH_OP_PRIME(pvecIn);
+      int32_t varlen;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* Load input data */
+        MORPH_OP_LOAD_IP(vecInData, vaInData, pvecIn);
+
+        xb_vecNx48 vecIntRes      = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData);
+        xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes));
+        vecOutL = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+        vecOutL = IVP_MAXN_2X32(vecOutL, (xb_vecN_2x32v) minLim);
+
+        xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes));
+        vecOutH = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+        vecOutH = IVP_MAXN_2X32(vecOutH, (xb_vecN_2x32v) minLim);
+        /* store output data */
+        IVP_SAN_2X32_IP(vecOutL, vaOut, pvecOut);
+        IVP_SAN_2X32_IP(vecOutH, vaOut, pvecOut);
+      }
+      varlen = (maxLoopCount - x);
+      MORPH_OP_LOAD_IP(vecInData, vaInData, pvecIn);
+
+
+      xb_vecNx48 vecIntRes = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData);
+
+      xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes));
+      vecOutL = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+      vecOutL = IVP_MAXN_2X32(vecOutL, (xb_vecN_2x32v) minLim);
+
+      xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes));
+      vecOutH = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+      vecOutH = IVP_MAXN_2X32(vecOutH, (xb_vecN_2x32v) minLim);
+
+      /* store output data */
+      IVP_SAVN_2X32_XP(vecOutL, vaOut, pvecOut, (varlen << 2));
+      IVP_SAVN_2X32_XP(vecOutH, vaOut, pvecOut, ((varlen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)));
+      IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch                                                   */
+    MORPH_IDT_VECTOR vecInData0, vecInData1, vecInData2, vecInData3;
+    xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H;
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        MORPH_IDT_SCALAR * pIn = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut          = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen         = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (MORPH_IDT_VECTORI8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = MORPH_OP_PRIME(pvecIn);
+          /* load input data */
+          MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn);
+          MORPH_OP_LOAD_IP(vecInData1, vaInData, pvecIn);
+          MORPH_OP_LOAD_IP(vecInData2, vaInData, pvecIn);
+          MORPH_OP_LOAD_IP(vecInData3, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecIntRes2 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData2);
+          xb_vecNx48 vecIntRes3 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData3);
+
+          xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+
+          vecOut0L = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+          vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut0H = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+          vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1L    = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+          vecOut1L    = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim);
+          vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+          vecOut1H    = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+          vecOut1H    = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2));
+
+          vecOut2L    = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+          vecOut2L    = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim);
+          vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2));
+          vecOut2H    = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+          vecOut2H    = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes3), IVP_CVT64SNX48LL(vecIntRes3));
+          vecOut3L    = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+          vecOut3L    = IVP_MAXN_2X32(vecOut3L, (xb_vecN_2x32v) minLim);
+          vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes3), IVP_CVT64SNX48HL(vecIntRes3));
+          vecOut3H    = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+          vecOut3H    = IVP_MAXN_2X32(vecOut3H, (xb_vecN_2x32v) minLim);
+
+
+
+          /* Store output data */
+          IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut2L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut2H, vaOut, pvecOut);
+
+          IVP_SAVN_2X32_XP(vecOut3L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut3H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Initialize input and output data pointer */
+        MORPH_IDT_SCALAR *pIn = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut         = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen        = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (MORPH_IDT_VECTORI8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = MORPH_OP_PRIME(pvecIn);
+          /* load input data */
+          MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn);
+          MORPH_OP_LOAD_IP(vecInData1, vaInData, pvecIn);
+          MORPH_OP_LOAD_IP(vecInData2, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0);
+
+          xb_vecNx48 vecIntRes1 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData1);
+
+          xb_vecNx48 vecIntRes2 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData2);
+
+          xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0L = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+          vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut0H = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+          vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1L    = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+          vecOut1L    = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim);
+          vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+          vecOut1H    = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+          vecOut1H    = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2));
+          vecOut2L    = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+          vecOut2L    = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim);
+          vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2));
+          vecOut2H    = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+          vecOut2H    = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim);
+
+          /* Store output data */
+          IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAVN_2X32_XP(vecOut2L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut2H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Initialize input and output data pointer */
+        MORPH_IDT_SCALAR *pIn = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut         = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen        = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (MORPH_IDT_VECTORI8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = MORPH_OP_PRIME(pvecIn);
+
+          /* load input data */
+          MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn);
+          MORPH_OP_LOAD_IP(vecInData1, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0);
+
+          xb_vecNx48 vecIntRes1 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData1);
+
+          xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0L = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+          vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut0H = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+          vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1L    = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+          vecOut1L    = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim);
+          vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+          vecOut1H    = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+          vecOut1H    = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim);
+
+
+          /* Store output data */
+          IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAVN_2X32_XP(vecOut1L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut1H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Initialize input and output data pointer */
+        MORPH_IDT_SCALAR *pIn = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut         = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen        = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (MORPH_IDT_VECTORI8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = MORPH_OP_PRIME(pvecIn);
+          /* load input data */
+          MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0);
+
+          xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0L = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+          vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut0H = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+          vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+          /* Store output data */
+          IVP_SAVN_2X32_XP(vecOut0L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut0H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_S32IX.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_S32IX.h
new file mode 100644
index 00000000000..a5484d43644
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_S32IX.h
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#undef MAKE_NAME_IMPL
+#undef MAKE_NAME
+#undef MORPH_ODT_TILECHECK
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MIN_VAL
+#undef MAX_VAL
+#undef MORPH_STORE_SA_IP
+#undef MORPH_STORE_SAV_XP
+#undef MORPH_FLUSH_SAPOS
+#undef BytesPerPixel
+/* The #undefs above drop any earlier definitions so the includer can select a new INPUT/OUTPUT_DATA_TYPE pair. */
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER)  name ## MORPH_FNAME_SPECIFIER
+/* One branch per supported output type; if no branch matches, none of the macros below are defined. */
+#if ((INPUT_DATA_TYPE == SIGNED32BIT) && (OUTPUT_DATA_TYPE == SIGNED8BIT))
+#define MAKE_NAME(name)  MAKE_NAME_IMPL(name, S8)   /* appends the output type suffix to the function name */
+#define MORPH_ODT_TILECHECK  XAI_CHECK_TILE3D_S8    /* error check applied to the output tile */
+#define MORPH_ODT_SCALAR     int8_t                 /* element type behind the output data pointer */
+#define MORPH_ODT_VECTOR     xb_vecNx8              /* vector type behind the output store pointer */
+#define MIN_VAL              SCHAR_MIN              /* lower clamp bound */
+#define MAX_VAL              SCHAR_MAX              /* upper clamp bound */
+#define MORPH_STORE_SA_IP    IVP_SANX8S_IP          /* store used for full vectors in the main loop */
+#define MORPH_STORE_SAV_XP   IVP_SAVNX8S_XP         /* store used with an explicit byte count for tails */
+#define MORPH_FLUSH_SAPOS    IVP_SAPOSNX8S_FP       /* issued after the last store of a run */
+#define BytesPerPixel        1                      /* converts an element count into the store byte count */
+
+
+#elif ((INPUT_DATA_TYPE == SIGNED32BIT) && (OUTPUT_DATA_TYPE == UNSIGNED8BIT))
+#define MAKE_NAME(name)  MAKE_NAME_IMPL(name, U8)
+#define MORPH_ODT_TILECHECK  XAI_CHECK_TILE3D_U8
+#define MORPH_ODT_SCALAR     uint8_t
+#define MORPH_ODT_VECTOR     xb_vecNx8U
+#define MIN_VAL              0
+#define MAX_VAL              UCHAR_MAX
+#define MORPH_STORE_SA_IP    IVP_SANX8U_IP
+#define MORPH_STORE_SAV_XP   IVP_SAVNX8U_XP
+#define MORPH_FLUSH_SAPOS    IVP_SAPOSNX8U_FP
+#define BytesPerPixel        1
+
+#elif ((INPUT_DATA_TYPE == SIGNED32BIT) && (OUTPUT_DATA_TYPE == SIGNED16BIT))
+#define MAKE_NAME(name)  MAKE_NAME_IMPL(name, S16)
+#define MORPH_ODT_TILECHECK  XAI_CHECK_TILE3D_S16
+#define MORPH_ODT_SCALAR     int16_t
+#define MORPH_ODT_VECTOR     xb_vecNx16
+#define MIN_VAL              SHRT_MIN
+#define MAX_VAL              SHRT_MAX
+#define MORPH_STORE_SA_IP    IVP_SANX16_IP
+#define MORPH_STORE_SAV_XP   IVP_SAVNX16_XP
+#define MORPH_FLUSH_SAPOS    IVP_SAPOSNX16_FP
+#define BytesPerPixel        2
+
+#elif ((INPUT_DATA_TYPE == SIGNED32BIT) && (OUTPUT_DATA_TYPE == UNSIGNED16BIT))
+#define MAKE_NAME(name)  MAKE_NAME_IMPL(name, U16)
+#define MORPH_ODT_TILECHECK  XAI_CHECK_TILE3D_U16
+#define MORPH_ODT_SCALAR     uint16_t
+#define MORPH_ODT_VECTOR     xb_vecNx16U
+#define MIN_VAL              0
+#define MAX_VAL              USHRT_MAX
+#define MORPH_STORE_SA_IP    IVP_SANX16U_IP
+#define MORPH_STORE_SAV_XP   IVP_SAVNX16U_XP
+#define MORPH_FLUSH_SAPOS    IVP_SAPOSNX16U_FP
+#define BytesPerPixel        2
+#endif
+
+
+/********************* xaiDataConversion3D_S32IX ******************************/
+/* Description : Scales S32 data by scale, packs with shift, then clamps to   */
+/*               the value range of the S8/U8/S16/U16 output tile type        */
+/* Inputs      : Input Tile, scale (U16 multiplier), shift (checked < 32)     */
+/* Outputs     : XAI Error Code                                               */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : InData is signed 32 bit                                      */
+/******************************************************************************/
+/********************* xaiDataConversion3D_S32S8  *****************************/
+/********************* xaiDataConversion3D_S32U8  *****************************/
+/********************* xaiDataConversion3D_S32S16 ******************************/
+/********************* xaiDataConversion3D_S32U16 *****************************/
+XAI_ERR_TYPE MAKE_NAME (xaiDataConversion3D_S32)(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S32(inTile);
+    MORPH_ODT_TILECHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 32", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters (sizes are taken from the input tile; the checks above require equal sizes) */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t minLim = MIN_VAL; /* lower clamp bound of the output type */
+  int32_t maxLim = MAX_VAL; /* upper clamp bound of the output type */
+
+  /* Get Data Pointers */
+  int32_t *pInput           = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  MORPH_ODT_SCALAR *pOutput = (MORPH_ODT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  valign vaOut = IVP_ZALIGN();
+
+  /* vectorizationWidth: 32-bit elements per xb_vecN_2x32v; 2X: elements handled per loop step (two vectors) */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH / 2;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecN_2x32v * restrict pvecIn;
+  MORPH_ODT_VECTOR * restrict pvecOut;
+
+  xb_vecN_2x64w vec0scaledIn64B, vec1scaledIn64B;
+
+  /* Scale replicated into a vector for the multiply */
+  xb_vecNx16U vecScale = (xb_vecNx16U) (scale);
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When both the input and the output dim1 pitch equal the tile width      */
+  /*    (dim1Size)                                                              */
+  /*    - The elements to convert from signed 32 bit to S8/U8/S16/U16 are       */
+  /*      stored contiguously along the first dimension, so each plane (or,     */
+  /*      when the dim2 pitches match dim1Size * dim2Size, the whole tile)      */
+  /*      is processed as one linear run of vectors                             */
+  /*                                                                            */
+  /* 2. When the input or the output dim1 pitch differs from the tile width     */
+  /*    (dim1Size)                                                              */
+  /*    - The elements to convert are not contiguous in memory. The first       */
+  /*      dimension is still vectorized, but the input and output pointers      */
+  /*      are recomputed for every row from the respective tile pitches         */
+  /*      and every row chunk is moved with variable-length loads/stores        */
+  /*      (IVP_LAVN_2X32_XP / MORPH_STORE_SAV_XP)                               */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input data vectors */
+    xb_vecN_2x32v vecInData0, vecInData1;
+
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecN_2x32v *) (pInput + (z * inTilePitch2));
+      pvecOut = (MORPH_ODT_VECTOR *) (pOutput + (z * outTilePitch2));
+
+      valign vaInData = IVP_LAN_2X32_PP(pvecIn);
+      xb_vecNx16 vecOut, vecOut0, vecOut1;
+      x = 0;
+      for (; x < maxLoopCount - vectorizationWidth2X; x += vectorizationWidth2X)
+      {
+        /* Load input data */
+        IVP_LAN_2X32_IP(vecInData0, vaInData, pvecIn);
+        IVP_LAN_2X32_IP(vecInData1, vaInData, pvecIn);
+
+        /* Multiply U16 scale with S32 input and store in 64-bit wide vector */
+        vec0scaledIn64B = IVP_MULUSN_2X16X32_0(vecScale, vecInData0);
+        vec1scaledIn64B = IVP_MULUSN_2X16X32_0(vecScale, vecInData1);
+
+        /* Pack the 64-bit wide vector in 32 bit vector, by applying shift */
+        xb_vecN_2x32v vec0scaledIn32B = IVP_PACKVRN_2X64W(vec0scaledIn64B, shift);
+        xb_vecN_2x32v vec1scaledIn32B = IVP_PACKVRN_2X64W(vec1scaledIn64B, shift);
+
+        /* CLAMP the 32bit scaled-shift data to minLim and maxLim, & view it as
+         * a 16-bit vector; only its even lanes (low halves) are used below.  */
+        vecOut0 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec0scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim));
+        vecOut1 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec1scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim));
+
+        /* Select the actual data present at even lanes, i.e. 0, 2, 4,...  */
+        vecOut = IVP_SELNX16I(vecOut1, vecOut0, IVP_SELI_EXTRACT_1_OF_2_OFF_0);
+
+        /* store output data */
+        MORPH_STORE_SA_IP(vecOut, vaOut, pvecOut);
+      }
+
+      /* Load remaining input data; the 2nd vector only takes what exceeds one full vector (vectorizationWidth) */
+      IVP_LAVN_2X32_XP(vecInData0, vaInData, pvecIn, (maxLoopCount - x) * 4);
+      IVP_LAVN_2X32_XP(vecInData1, vaInData, pvecIn, ((maxLoopCount - x) - vectorizationWidth) * 4);
+
+      /* Multiply U16 scale with S32 input and store in 64-bit wide vector */
+      vec0scaledIn64B = IVP_MULUSN_2X16X32_0(vecScale, vecInData0);
+      vec1scaledIn64B = IVP_MULUSN_2X16X32_0(vecScale, vecInData1);
+
+      /* Pack the 64-bit wide vector in 32 bit vector, by applying shift */
+      xb_vecN_2x32v vec0scaledIn32B = IVP_PACKVRN_2X64W(vec0scaledIn64B, shift);
+      xb_vecN_2x32v vec1scaledIn32B = IVP_PACKVRN_2X64W(vec1scaledIn64B, shift);
+
+      /* CLAMP the 32bit scaled-shift data to minLim and maxLim, & view it as
+       * a 16-bit vector; only its even lanes (low halves) are used below.  */
+      vecOut0 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec0scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim));
+      vecOut1 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec1scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim));
+
+      /* Select the actual data present at even lanes, i.e. 0, 2, 4,...  */
+      vecOut = IVP_SELNX16I(vecOut1, vecOut0, IVP_SELI_EXTRACT_1_OF_2_OFF_0);
+
+      /* store only the remaining elements, then flush the alignment register */
+      MORPH_STORE_SAV_XP(vecOut, vaOut, pvecOut, (maxLoopCount - x) * BytesPerPixel);
+      MORPH_FLUSH_SAPOS(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if the input or the output dim1 pitch differs from the tile width, */
+    /* i.e. rows are not contiguous in memory                                                    */
+
+    for (z = 0; z < dim3Size; z++)                 /* along 3rd dimension */
+    {
+      x = 0;
+      for (; x < dim1Size; x += vectorizationWidth2X) /* Load two vectors along 1st dimension*/
+      {
+        /* Initialize input and output data pointer */
+        int32_t * pIn          = &pInput[z * inTilePitch2 + x];
+        MORPH_ODT_SCALAR *pOut = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen         = dim1Size - x; /* elements left in the row from column x */
+
+        for (y = 0; y < dim2Size; y++)              /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecN_2x32v vecInData0, vecInData1;
+          xb_vecNx16 vecOut0, vecOut1, vecOut;
+
+          pvecIn  = (xb_vecN_2x32v *) (pIn + (y * inTilePitch1));
+          pvecOut = (MORPH_ODT_VECTOR *) (pOut + (y * outTilePitch1));
+
+          /* Load input data; the 2nd vector only takes what exceeds one full vector (vectorizationWidth) */
+          valign vaInData = IVP_LAN_2X32_PP(pvecIn);
+          IVP_LAVN_2X32_XP(vecInData0, vaInData, pvecIn, varLen * 4);
+          IVP_LAVN_2X32_XP(vecInData1, vaInData, pvecIn, (varLen - vectorizationWidth) * 4);
+
+          /* Multiply U16 scale with S32 input and store in 64-bit wide vector */
+          vec0scaledIn64B = IVP_MULUSN_2X16X32_0(vecScale, vecInData0);
+          vec1scaledIn64B = IVP_MULUSN_2X16X32_0(vecScale, vecInData1);
+
+          /* Pack the 64-bit wide vector in 32 bit vector, by applying shift */
+          xb_vecN_2x32v vec0scaledIn32B = IVP_PACKVRN_2X64W(vec0scaledIn64B, shift);
+          xb_vecN_2x32v vec1scaledIn32B = IVP_PACKVRN_2X64W(vec1scaledIn64B, shift);
+
+          /* CLAMP the 32bit scaled-shift data to minLim and maxLim, & view it as
+           * a 16-bit vector; only its even lanes (low halves) are used below.  */
+          vecOut0 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec0scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim));
+          vecOut1 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec1scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim));
+
+          /* Select the actual data present at even lanes, i.e. 0, 2, 4,...  */
+          vecOut = IVP_SELNX16I(vecOut1, vecOut0, IVP_SELI_EXTRACT_1_OF_2_OFF_0);
+
+          /* Store only the elements left in this row, then flush the alignment register */
+          MORPH_STORE_SAV_XP(vecOut, vaOut, pvecOut, varLen * BytesPerPixel);
+          MORPH_FLUSH_SAPOS(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+#endif //#if ((XCHAL_VISION_TYPE >= 6))
+
+
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_datatransform.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_datatransform.c
new file mode 100644
index 00000000000..dc7fb38a576
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_datatransform.c
@@ -0,0 +1,7835 @@
+/*
+ * Copyright (c) 2022 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include <string.h>
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#undef INPUT_DATA_TYPE
+#undef OUTPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE  INTEGER8BIT
+#include "cnn_fill_tile.h"
+#undef INPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE  INTEGER16BIT
+#include "cnn_fill_tile.h"
+#undef INPUT_DATA_TYPE
+
+#if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+#define INPUT_DATA_TYPE  FLOAT16BIT
+#include "cnn_fill_tile.h"
+#undef INPUT_DATA_TYPE
+#endif
+
+#if (XCHAL_HAVE_VISION_SP_VFPU == 1)
+#define INPUT_DATA_TYPE  FLOAT32BIT
+#include "cnn_fill_tile.h"
+#undef INPUT_DATA_TYPE
+#endif
+
+#define INPUT_DATA_TYPE  INTEGER8BIT
+#include "cnn_extend_edge.h"
+#undef INPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE  INTEGER16BIT
+#include "cnn_extend_edge.h"
+#undef INPUT_DATA_TYPE
+
+#if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+#define INPUT_DATA_TYPE  FLOAT16BIT
+#include "cnn_extend_edge.h"
+#undef INPUT_DATA_TYPE
+#endif
+
+#if (XCHAL_HAVE_VISION_SP_VFPU == 1)
+#define INPUT_DATA_TYPE  FLOAT32BIT
+#include "cnn_extend_edge.h"
+#undef INPUT_DATA_TYPE
+#endif
+
+#define INPUT_DATA_TYPE  SIGNED16BIT
+#include "cnn_dataConversion3D_I16I8.h"
+#undef INPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_dataConversion3D_I16I8.h"
+#undef INPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_dataConversion3D_I8I32.h"
+#undef INPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_dataConversion3D_I8I32.h"
+#undef INPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE   SIGNED32BIT
+#define OUTPUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_dataConversion3D_S32IX.h"
+#undef INPUT_DATA_TYPE
+#undef OUTPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE   SIGNED32BIT
+#define OUTPUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_dataConversion3D_S32IX.h"
+#undef INPUT_DATA_TYPE
+#undef OUTPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE   SIGNED32BIT
+#define OUTPUT_DATA_TYPE  SIGNED16BIT
+#include "cnn_dataConversion3D_S32IX.h"
+#undef INPUT_DATA_TYPE
+#undef OUTPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE   SIGNED32BIT
+#define OUTPUT_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_dataConversion3D_S32IX.h"
+#undef INPUT_DATA_TYPE
+#undef OUTPUT_DATA_TYPE
+
+#define OUTPUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_dataConversion3D_AsymQ_S8IX.h"
+#undef OUTPUT_DATA_TYPE
+
+#define OUTPUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_dataConversion3D_AsymQ_S8IX.h"
+#undef OUTPUT_DATA_TYPE
+
+#define OUTPUT_DATA_TYPE  SIGNED16BIT
+#include "cnn_dataConversion3D_AsymQ_S8IX.h"
+#undef OUTPUT_DATA_TYPE
+
+#define OUTPUT_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_dataConversion3D_AsymQ_S8IX.h"
+#undef OUTPUT_DATA_TYPE
+
+#define PACK_ROUND_U16(vecOut1, vecInData1, Scale, Shift)  { /* vecOut1 = U16-clamped, Shift-packed Scale * vecInData1 */ \
+    xb_vecNx48 acc          = IVP_MULUSNX16((xb_vecNx16U) (Scale), (vecInData1)); /* wide products */                  \
+    xb_vecN_2x32v m_outEven = IVP_PACKVRNX48_0(acc, (Shift));                     /* pack, part _0 */                  \
+    xb_vecN_2x32v m_outOdd  = IVP_PACKVRNX48_1(acc, (Shift));                     /* pack, part _1 */                  \
+    m_outEven = IVP_MAXN_2X32(IVP_MINN_2X32(m_outEven, (xb_vecN_2x32v) USHRT_MAX), (xb_vecN_2x32v) 0); /* clamp */     \
+    m_outOdd  = IVP_MAXN_2X32(IVP_MINN_2X32(m_outOdd, (xb_vecN_2x32v) USHRT_MAX), (xb_vecN_2x32v) 0);  /* clamp */     \
+    xb_vecNx16U temp1 = IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(m_outEven)); /* view as 16-bit lanes */           \
+    xb_vecNx16U temp2 = IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(m_outOdd));  /* view as 16-bit lanes */           \
+    (vecOut1) = IVP_SELNX16UI(temp2, temp1, IVP_SELI_16B_INTERLEAVE_1_EVEN);      /* merge both parts */               \
+}
+
+/*************************** xaiFillTile3D ***********************************/
+/* Description : Type dispatcher for FillTile3D. Forwards the call to the   */
+/*               I8, I16, F16 or F32 variant selected by the data type of   */
+/*               the destination tile                                       */
+/* Inputs      : constant value to fill, fillEdgeExtension                  */
+/* Outputs     : XI Error Code; XAI_ERR_NO_VARIANT for unsupported types    */
+/* InOuts      : Destination Tile                                           */
+/****************************************************************************/
+XAI_ERR_TYPE xaiFillTile3D(xai_pTile3D dstTile,
+                           const int32_t value,
+                           xai_bool fillEdgeExtension)
+{
+  if (!dstTile) /* nothing to dispatch on */
+  {
+    return XAI_ERR_NULLARG;
+  }
+  /* Every branch below returns, so the type tests are written as independent guards. */
+  if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(dstTile, XAI_U8))
+  {
+    return xaiFillTile3D_I8(dstTile, value, fillEdgeExtension);
+  }
+  if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(dstTile, XAI_U16))
+  {
+    return xaiFillTile3D_I16(dstTile, value, fillEdgeExtension);
+  }
+#if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+  if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_F16))
+  {
+    return xaiFillTile3D_F16(dstTile, value, fillEdgeExtension);
+  }
+#endif
+#if (XCHAL_HAVE_VISION_SP_VFPU == 1)
+  if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_F32))
+  {
+    return xaiFillTile3D_F32(dstTile, value, fillEdgeExtension);
+  }
+#endif
+  return XAI_ERR_NO_VARIANT;
+}
+
+/************************* xaiExtendEdgesConst3D *****************************/
+/* Description : Type dispatcher for ExtendEdgesConst3D. Forwards the call  */
+/*               to the I8, I16, F16 or F32 variant selected by the data    */
+/*               type of the destination tile                               */
+/* Inputs      : constant value to fill the edges, frame3DSize              */
+/* Outputs     : XI Error Code; XAI_ERR_NO_VARIANT for unsupported types    */
+/* InOuts      : Destination Tile                                           */
+/****************************************************************************/
+XAI_ERR_TYPE xaiExtendEdgesConst3D(xai_pTile3D dstTile,
+                                   const int32_t value,
+                                   xai_size3D frame3DSize)
+{
+  if (!dstTile) /* nothing to dispatch on */
+  {
+    return XAI_ERR_NULLARG;
+  }
+  /* Every branch below returns, so the type tests are written as independent guards. */
+  if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(dstTile, XAI_U8))
+  {
+    return xaiExtendEdgesConst3D_I8(dstTile, value, frame3DSize);
+  }
+  if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(dstTile, XAI_U16))
+  {
+    return xaiExtendEdgesConst3D_I16(dstTile, value, frame3DSize);
+  }
+#if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+  if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_F16))
+  {
+    int16_t halfBits = (int16_t) value; /* low 16 bits of value are reinterpreted as an xb_f16 */
+#if defined(__XTENSA__)
+    xb_f16 fillF16;
+    memcpy(&fillF16, &halfBits, sizeof(int16_t));
+#else
+    xb_f16 fillF16 = *(xb_f16 *) (&halfBits);
+#endif
+    return xaiExtendEdgesConst3D_F16(dstTile, fillF16, frame3DSize);
+  }
+#endif
+#if (XCHAL_HAVE_VISION_SP_VFPU == 1)
+  if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_F32))
+  {
+    int32_t floatBits = value; /* the 32 bits of value are reinterpreted as a float */
+    float fillF32;
+    memcpy(&fillF32, &floatBits, sizeof(int32_t));
+    return xaiExtendEdgesConst3D_F32(dstTile, fillF32, frame3DSize);
+  }
+#endif
+  return XAI_ERR_NO_VARIANT;
+}
+
+/***********************   xaiExtendEdges3D   ********************************/
+/* Description : Type dispatcher for ExtendEdges3D. Forwards the call to    */
+/*               the I8, I16, F16 or F32 variant selected by the data       */
+/*               type of the destination tile                               */
+/* Inputs      : pArray, frame3DSize                                        */
+/* Outputs     : XI Error Code; XAI_ERR_NO_VARIANT for unsupported types    */
+/* InOuts      : Destination Tile                                           */
+/****************************************************************************/
+XAI_ERR_TYPE xaiExtendEdges3D(xai_pTile3D dstTile,
+                              const xai_pArray pArray,
+                              xai_size3D frame3DSize)
+{
+  if (!dstTile) /* nothing to dispatch on */
+  {
+    return XAI_ERR_NULLARG;
+  }
+  /* Every branch below returns, so the type tests are written as independent guards. */
+  if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(dstTile, XAI_U8))
+  {
+    return xaiExtendEdges3D_I8(dstTile, pArray, frame3DSize);
+  }
+  if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(dstTile, XAI_U16))
+  {
+    return xaiExtendEdges3D_I16(dstTile, pArray, frame3DSize);
+  }
+#if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+  if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_F16))
+  {
+    return xaiExtendEdges3D_F16(dstTile, pArray, frame3DSize);
+  }
+#endif
+#if (XCHAL_HAVE_VISION_SP_VFPU == 1)
+  if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_F32))
+  {
+    return xaiExtendEdges3D_F32(dstTile, pArray, frame3DSize);
+  }
+#endif
+  return XAI_ERR_NO_VARIANT;
+}
+
+/************************** xaiCopyTile3D  ***********************************/
+/* Description : P6 optimized implementation for copying the contents of a  */
+/*               3D tile to another 3D tile. This function supports copying */
+/*               of 8/16/32/64 bit input tile data based on data type of    */
+/*               tile data elements. copy_edge_extension flag is used to    */
+/*               control copy of edges. If edge sizes differ, the minimum   */
+/*               of the input & output edge size is copied for each edge.   */
+/* Inputs      : Input Tile, copy_edge_extension                            */
+/* Outputs     : XAI Error Code                                             */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : Input & output tiles have the same active data size,       */
+/*               element size and data order, and do not overlap            */
+/****************************************************************************/
+
+XAI_ERR_TYPE xaiCopyTile3D(const xai_pTile3D inTile,
+                           xai_pTile3D outTile,
+                           xai_bool copy_edge_extension)
+{
+  /* Error Checks: the XAI_CHECK_* macros below validate both tiles (see their
+   * definitions for the exact semantics). The two explicit XAI_CHECK_ERROR
+   * calls require an element size of 1, 2, 4 or 8 bytes and an identical data
+   * order for the input and the output tile. */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D(inTile);
+    XAI_CHECK_TILE3D(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR((((XAI_TILE3D_GET_ELEMENT_SIZE(inTile) == 1) || (XAI_TILE3D_GET_ELEMENT_SIZE(inTile) == 2)) ||                \
+                     (XAI_TILE3D_GET_ELEMENT_SIZE(inTile) == 4) || (XAI_TILE3D_GET_ELEMENT_SIZE(inTile) == 8)), XAI_ERR_DATATYPE, \
+                    "Element size of Input tile = %d, The argument of input tile has unsupported data type",                      \
+                    XAI_TILE3D_GET_ELEMENT_SIZE(inTile));
+    XAI_CHECK_TILE3D_ELEMENT_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Getting parameters from the tile structures                               */
+  /* The dim1 size, the dim1 edges and both pitches are multiplied by the      */
+  /* element size, i.e. held in bytes, so the copy below works on 8-bit data   */
+
+  const int32_t element_size  = XAI_TILE3D_GET_ELEMENT_SIZE(inTile);
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile) * element_size;
+  const int32_t inDim1Edge1   = XAI_TILE3D_GET_DIM1_EDGE1(inTile) * element_size;
+  const int32_t inDim1Edge2   = XAI_TILE3D_GET_DIM1_EDGE2(inTile) * element_size;
+  const int32_t outDim1Edge1  = XAI_TILE3D_GET_DIM1_EDGE1(outTile) * element_size;
+  const int32_t outDim1Edge2  = XAI_TILE3D_GET_DIM1_EDGE2(outTile) * element_size;
+  const int32_t inDataPitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile) * element_size;
+  const int32_t inDataPitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile) * element_size;
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile) * element_size;
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile) * element_size;
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inDim2Edge1   = XAI_TILE3D_GET_DIM2_EDGE1(inTile);
+  const int32_t inDim2Edge2   = XAI_TILE3D_GET_DIM2_EDGE2(inTile);
+  const int32_t inDim3Edge1   = XAI_TILE3D_GET_DIM3_EDGE1(inTile);
+  const int32_t inDim3Edge2   = XAI_TILE3D_GET_DIM3_EDGE2(inTile);
+  const int32_t outDim2Edge1  = XAI_TILE3D_GET_DIM2_EDGE1(outTile);
+  const int32_t outDim2Edge2  = XAI_TILE3D_GET_DIM2_EDGE2(outTile);
+  const int32_t outDim3Edge1  = XAI_TILE3D_GET_DIM3_EDGE1(outTile);
+  const int32_t outDim3Edge2  = XAI_TILE3D_GET_DIM3_EDGE2(outTile);
+  /* Vectorization for xaiCopyTile3D function is always done across the first dimension.
+   * vectorizationWidth (2 * XCHAL_IVPN_SIMD_WIDTH) is used below as the number of bytes
+   * moved by one full xb_vec2Nx8 load/store; the 2X/3X/4X values are the byte counts of
+   * two, three and four such vectors. */
+  int32_t vectorizationWidth   = 2 * XCHAL_IVPN_SIMD_WIDTH;
+  int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  int8_t *pInput  = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  int32_t z, x, y;
+  int32_t dim1CopySize = dim1Size; /* bytes copied per row */
+  int32_t dim2CopySize = dim2Size; /* rows copied per plane */
+  int32_t dim3CopySize = dim3Size; /* planes copied */
+  int32_t dim1CopyEdge1Size; /* the six edge sizes are only set and used when copy_edge_extension is true */
+  int32_t dim2CopyEdge1Size;
+  int32_t dim3CopyEdge1Size;
+  int32_t dim1CopyEdge2Size;
+  int32_t dim2CopyEdge2Size;
+  int32_t dim3CopyEdge2Size;
+  int32_t maxLoopCount;
+  valign vaInData;
+  valign vaOutData = IVP_ZALIGN();
+  xb_vec2Nx8* restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8 vecValue;
+
+  /* If copy_edge_extension flag is enabled update input and output data pointer  */
+  /* and data copy size across all 3 dimensions: every copy size grows by the
+   * smaller of the input and output edge on each side, and both data pointers
+   * are moved back by the leading (edge1) sizes so that they point at the first
+   * byte of the region that is copied. */
+
+  if (copy_edge_extension)
+  {
+    dim1CopyEdge1Size = XT_MIN(inDim1Edge1, outDim1Edge1);
+    dim2CopyEdge1Size = XT_MIN(inDim2Edge1, outDim2Edge1);
+    dim3CopyEdge1Size = XT_MIN(inDim3Edge1, outDim3Edge1);
+    dim1CopyEdge2Size = XT_MIN(inDim1Edge2, outDim1Edge2);
+    dim2CopyEdge2Size = XT_MIN(inDim2Edge2, outDim2Edge2);
+    dim3CopyEdge2Size = XT_MIN(inDim3Edge2, outDim3Edge2);
+    dim1CopySize      = dim1Size + dim1CopyEdge1Size + dim1CopyEdge2Size;
+    dim2CopySize      = dim2Size + dim2CopyEdge1Size + dim2CopyEdge2Size;
+    dim3CopySize      = dim3Size + dim3CopyEdge1Size + dim3CopyEdge2Size;
+    pInput            = &pInput[-dim1CopyEdge1Size + ((-dim2CopyEdge1Size) * inDataPitch1) \
+                                + ((-dim3CopyEdge1Size) * inDataPitch2)];
+    pOutput = &pOutput[-dim1CopyEdge1Size + ((-dim2CopyEdge1Size) * outDataPitch1) \
+                       + ((-dim3CopyEdge1Size) * outDataPitch2)];
+  }
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When both input and output dim1 pitch are equal to the dim1 copy size.  */
+  /*    - If above condition holds good, memory location to be copied           */
+  /*      from inTile to outTile is contiguous. Hence vectorization can be      */
+  /*      utilized effectively                                                  */
+  /* 2. When either dim1 pitch differs from the dim1 copy size.                 */
+  /*    - If above condition holds good, memory location to be copied           */
+  /*      from inTile to outTile is not contiguous. In order to do              */
+  /*      vectorization across first dimension, input and output data           */
+  /*      pointers need to be updated for every row based on the tile           */
+  /*      pitches                                                               */
+  /******************************************************************************/
+
+  if ((inDataPitch1 == dim1CopySize) && (outDataPitch1 == dim1CopySize))
+  {
+    /* Data to be copied exist in contiguous memory location with respect to */
+    /* first dimension, so one whole dim1 x dim2 plane is copied as one run  */
+
+    /* Initialize max loop counter: one run of maxLoopCount bytes per plane */
+    int32_t maxdim3LoopCount = dim3CopySize;
+    maxLoopCount = dim1CopySize * dim2CopySize;
+
+    if ((inDataPitch2 == maxLoopCount) && (outDataPitch2 == maxLoopCount))
+    {
+      /* Data to be copied exist in contiguous memory location with respect to */
+      /* first and second dimension, so all planes form one single run         */
+
+      /* Update max loop counter */
+      maxdim3LoopCount = 1;
+      maxLoopCount    *= dim3CopySize;
+    }
+    for (z = 0; z < maxdim3LoopCount; z++)
+    {
+      /* initialize input and output data pointer and prime the input alignment register */
+      pdvecIn  = (xb_vec2Nx8 *) (pInput + (z * inDataPitch2));
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (z * outDataPitch2));
+      vaInData = IVP_LA2NX8_PP(pdvecIn);
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) /* full vectors while more than one vector remains */
+      {
+        /* Read vector input data */
+        IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn);
+        /* Store vector output data */
+        IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut);
+      }
+
+      IVP_LAV2NX8_XP(vecValue, vaInData, pdvecIn, maxLoopCount - x); /* variable-length load of the remaining bytes */
+      IVP_SAV2NX8_XP(vecValue, vaOutData, pdvecOut, maxLoopCount - x); /* matching variable-length store */
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); /* flush the output alignment register */
+    }
+  }
+  else
+  {
+    /* else block executes if the input or the output dim1 pitch differs from the        */
+    /* dim1 copy size, i.e. consecutive rows are not adjacent in at least one tile       */
+
+    for (z = 0; z < dim3CopySize; z++) /* Loop across dim3 */
+    {
+      x = 0;
+      /* Loop across dimension 1 */
+
+      /* Each row is split into column strips along dimension 1; for every strip all  */
+      /* dim2CopySize rows are copied before moving on to the next strip.
+       * This loop handles strips while more than 3 vectors remain in the row: 3 full
+       * vectors plus a variable-length 4th one. varLen may exceed one vector here;
+       * NOTE(review): this presumably relies on IVP_LAV2NX8_XP / IVP_SAV2NX8_XP limiting
+       * the transfer to one vector - confirm against the ISA reference. */
+      for (; x < (dim1CopySize - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* initialize input and output data pointer to the strip start in plane z */
+        int8_t *pInput1  = pInput + x + (z * inDataPitch2);
+        int8_t *pOutput1 = pOutput + x + (z * outDataPitch2);
+        int32_t varLen   = dim1CopySize - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2CopySize; y++)
+        {
+          pdvecIn  = (xb_vec2Nx8 *) (pInput1 + (y * inDataPitch1));
+          pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1));
+          vaInData = IVP_LA2NX8_PP(pdvecIn);
+
+          /* Read vector data from inTile and copy vector data to outTile */
+          IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn);
+          IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut);
+          IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn);
+          IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut);
+          IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn);
+          IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut);
+          IVP_LAV2NX8_XP(vecValue, vaInData, pdvecIn, varLen);
+          IVP_SAV2NX8_XP(vecValue, vaOutData, pdvecOut, varLen);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+      }
+      if (x < (dim1CopySize - vectorizationWidth2X)) /* more than 2 vectors left: 2 full vectors plus a variable-length tail */
+      {
+        /* initialize input and output data pointer */
+        int8_t *pInput1  = pInput + x + (z * inDataPitch2);
+        int8_t *pOutput1 = pOutput + x + (z * outDataPitch2);
+        int32_t varLen   = dim1CopySize - (x + vectorizationWidth2X);
+        for (y = 0; y < dim2CopySize; y++)
+        {
+          pdvecIn  = (xb_vec2Nx8 *) (pInput1 + (y * inDataPitch1));
+          pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1));
+          vaInData = IVP_LA2NX8_PP(pdvecIn);
+
+          /* Read vector data from inTile and copy vector data to outTile */
+          IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn);
+          IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut);
+          IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn);
+          IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut);
+          IVP_LAV2NX8_XP(vecValue, vaInData, pdvecIn, varLen);
+          IVP_SAV2NX8_XP(vecValue, vaOutData, pdvecOut, varLen);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+      }
+      else if (x < (dim1CopySize - vectorizationWidth)) /* more than 1 vector left: 1 full vector plus a variable-length tail */
+      {
+        /* initialize input and output data pointer */
+        int8_t *pInput1  = pInput + x + (z * inDataPitch2);
+        int8_t *pOutput1 = pOutput + x + (z * outDataPitch2);
+        int32_t varLen   = dim1CopySize - (x + vectorizationWidth);
+        for (y = 0; y < dim2CopySize; y++)
+        {
+          pdvecIn  = (xb_vec2Nx8 *) (pInput1 + (y * inDataPitch1));
+          pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1));
+          vaInData = IVP_LA2NX8_PP(pdvecIn);
+
+          /* Read vector data from inTile and copy vector data to outTile */
+          IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn);
+          IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut);
+          IVP_LAV2NX8_XP(vecValue, vaInData, pdvecIn, varLen);
+          IVP_SAV2NX8_XP(vecValue, vaOutData, pdvecOut, varLen);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+      }
+      else if (x < dim1CopySize) /* at most 1 vector left: a single variable-length transfer */
+      {
+        /* initialize input and output data pointer */
+        int8_t *pInput1  = pInput + x + (z * inDataPitch2);
+        int8_t *pOutput1 = pOutput + x + (z * outDataPitch2);
+        int32_t varLen   = dim1CopySize - x;
+        for (y = 0; y < dim2CopySize; y++)
+        {
+          pdvecIn  = (xb_vec2Nx8 *) (pInput1 + (y * inDataPitch1));
+          pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1));
+          vaInData = IVP_LA2NX8_PP(pdvecIn);
+
+          /* Read vector data from inTile and copy vector data */
+          IVP_LAV2NX8_XP(vecValue, vaInData, pdvecIn, varLen);
+          IVP_SAV2NX8_XP(vecValue, vaOutData, pdvecOut, varLen);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/************************ xaiUnsignedToSigned3D_U8S8 ******************************/
+/* Description : P6 optimized implementation for converting the tile data from   */
+/*               unsigned 8bit to signed 8bit. This function can operate         */
+/*               in-place. Applications needing this function to operate         */
+/*               in-place can provide the same Input and Output Tiles.           */
+/*               Each value v becomes min(SCHAR_MAX, IVP_AVGRU2NX8(v, 0)).       */
+/*               NOTE(review): IVP_AVGRU2NX8(v, 0) is presumably the rounding    */
+/*               average with zero, i.e. (v + 1) >> 1 - confirm in the ISA       */
+/*               reference.                                                      */
+/* Inputs      : Input Tile                                                      */
+/* Outputs     : XAI Error Code                                                  */
+/* InOuts      : Output Tile                                                     */
+/* Assumptions : InData is unsigned 8bit                                         */
+/*               Input and output tiles have the same size and data order        */
+/*               Unsigned to Signed 8bit conversion not performed on tile edges  */
+/*********************************************************************************/
+XAI_ERR_TYPE xaiUnsignedToSigned3D_U8S8(xai_pTile3D inTile, xai_pTile3D outTile)
+{
+  /* Error Checks. There is deliberately no overlap check: the function may be */
+  /* called with the same tile as input and output (in-place operation).       */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U8(inTile);
+    XAI_CHECK_TILE3D_S8(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Getting parameters from the tile structures (elements are 1 byte wide, */
+  /* so sizes and pitches are used directly as byte counts)                 */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t inDataPitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+
+  /* Input and Output Data Pointers */
+  uint8_t *pInput = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t maxLoopCount;
+
+  /*  xaiUnsignedToSigned3D_U8S8 function support in-place unsigned to signed 8bit */
+  /*  conversion. In a such a scenario inTile and outTile will be overlapping.     */
+  /*  Hence restrict keyword must not be used for input and output data pointers:  */
+  /*  accessing the same object through two restrict-qualified pointers, one of    */
+  /*  which writes to it, is undefined behaviour in C.                             */
+  xb_vec2Nx8U* pdvecIn;
+  xb_vec2Nx8* pdvecOut;
+  valign vaInData;
+  valign vaOutData = IVP_ZALIGN();
+  xb_vec2Nx8U vecValue1, vecValue2, vecValue3, vecValue4;
+  xb_vec2Nx8 vecValueSigned1, vecValueSigned2, vecValueSigned3, vecValueSigned4;
+  const xb_vec2Nx8 signedCharMax = SCHAR_MAX;
+
+  /* Vectorization for xaiUnsignedToSigned3D_U8S8 function */
+  /* is always done across the first dimension.            */
+  /* vectorizationWidth is used as the byte count of one   */
+  /* full 2Nx8 vector, the 2X/3X/4X values as multiples.   */
+  int32_t vectorizationWidth   = 2 * XCHAL_IVPN_SIMD_WIDTH;
+  int32_t vectorizationWidth2X = 2 * vectorizationWidth;
+  int32_t vectorizationWidth3X = 3 * vectorizationWidth;
+  int32_t vectorizationWidth4X = 4 * vectorizationWidth;
+  int32_t x, y, z;
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If above condition holds good, data elements for which unsigned       */
+  /*      8 bit to signed 8 bit conversion need to done present in contiguous   */
+  /*      memory location. Hence vectorization can be utilized effectively      */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile size or input tile     */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - If above condition holds good, data elements for which unsigned       */
+  /*      8 bit to signed 8 bit conversion need to done exist in non-contiguous */
+  /*      memory location. In order to do vectorization across first dimension, */
+  /*      input and output data pointers need to be updated for every row based */
+  /*      on the tile pitches                                                   */
+  /******************************************************************************/
+
+  if ((inDataPitch1 == dim1Size) && (outDataPitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* Initialize max loop counter: one run of maxLoopCount bytes per plane */
+    int32_t dim3MaxLoopCount = dim3Size;
+    maxLoopCount = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inDataPitch2 == maxLoopCount) && (outDataPitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension, so the whole tile is processed as a single run   */
+      /**********************************************************************/
+      dim3MaxLoopCount = 1;       /* Update max loop counter */
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input data pointer */
+      pdvecIn = (xb_vec2Nx8U *) (pInput + (z * inDataPitch2));
+      /* initialize output data pointer */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (z * outDataPitch2));
+      /* prime the input alignment register */
+      vaInData = IVP_LA2NX8U_PP(pdvecIn);
+
+      /* Main loop: 4 full vectors per iteration while more than 4 vectors remain */
+      for (x = 0; x < maxLoopCount - vectorizationWidth4X; x += vectorizationWidth4X)
+      {
+        /* Load Data */
+        IVP_LA2NX8U_IP(vecValue1, vaInData, pdvecIn);
+        IVP_LA2NX8U_IP(vecValue2, vaInData, pdvecIn);
+        IVP_LA2NX8U_IP(vecValue3, vaInData, pdvecIn);
+        IVP_LA2NX8U_IP(vecValue4, vaInData, pdvecIn);
+
+        /* Perform unsigned to signed conversion and rounding off operation */
+        vecValue1 = IVP_AVGRU2NX8(vecValue1, 0);
+        vecValue2 = IVP_AVGRU2NX8(vecValue2, 0);
+        vecValue3 = IVP_AVGRU2NX8(vecValue3, 0);
+        vecValue4 = IVP_AVGRU2NX8(vecValue4, 0);
+
+        /* Perform saturation of signed max value */
+        vecValueSigned1 = IVP_MINU2NX8U(signedCharMax, vecValue1);
+        vecValueSigned2 = IVP_MINU2NX8U(signedCharMax, vecValue2);
+        vecValueSigned3 = IVP_MINU2NX8U(signedCharMax, vecValue3);
+        vecValueSigned4 = IVP_MINU2NX8U(signedCharMax, vecValue4);
+
+        /* Store Data */
+        IVP_SA2NX8_IP(vecValueSigned1, vaOutData, pdvecOut);
+        IVP_SA2NX8_IP(vecValueSigned2, vaOutData, pdvecOut);
+        IVP_SA2NX8_IP(vecValueSigned3, vaOutData, pdvecOut);
+        IVP_SA2NX8_IP(vecValueSigned4, vaOutData, pdvecOut);
+      }
+      /* Load remaining data (at most 4 vectors). The requested lengths grow from  */
+      /* (remaining - 3 vectors) up to (remaining); the loads and the stores below */
+      /* use the same lengths in the same order, so element order is preserved.    */
+      /* NOTE(review): this presumably relies on the variable-length load/store    */
+      /* transferring nothing for a length <= 0 and at most one vector otherwise - */
+      /* confirm against the ISA reference.                                        */
+      IVP_LAV2NX8U_XP(vecValue1, vaInData, pdvecIn, maxLoopCount - (x + vectorizationWidth3X));
+      IVP_LAV2NX8U_XP(vecValue2, vaInData, pdvecIn, maxLoopCount - (x + vectorizationWidth2X));
+      IVP_LAV2NX8U_XP(vecValue3, vaInData, pdvecIn, maxLoopCount - (x + vectorizationWidth));
+      IVP_LAV2NX8U_XP(vecValue4, vaInData, pdvecIn, maxLoopCount - x);
+
+      /* Perform unsigned to signed conversion and rounding off operation */
+      vecValue1 = IVP_AVGRU2NX8(vecValue1, 0);
+      vecValue2 = IVP_AVGRU2NX8(vecValue2, 0);
+      vecValue3 = IVP_AVGRU2NX8(vecValue3, 0);
+      vecValue4 = IVP_AVGRU2NX8(vecValue4, 0);
+
+      /* Perform saturation of signed max value */
+      vecValueSigned1 = IVP_MINU2NX8U(signedCharMax, vecValue1);
+      vecValueSigned2 = IVP_MINU2NX8U(signedCharMax, vecValue2);
+      vecValueSigned3 = IVP_MINU2NX8U(signedCharMax, vecValue3);
+      vecValueSigned4 = IVP_MINU2NX8U(signedCharMax, vecValue4);
+
+      /* Variable stores, then flush the output alignment register */
+      IVP_SAV2NX8_XP(vecValueSigned1, vaOutData, pdvecOut,
+                     maxLoopCount - (x + vectorizationWidth3X));
+      IVP_SAV2NX8_XP(vecValueSigned2, vaOutData, pdvecOut,
+                     maxLoopCount - (x + vectorizationWidth2X));
+      IVP_SAV2NX8_XP(vecValueSigned3, vaOutData, pdvecOut, maxLoopCount - (x + vectorizationWidth));
+      IVP_SAV2NX8_XP(vecValueSigned4, vaOutData, pdvecOut, maxLoopCount - x);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if the input or the output dim1 pitch is not equal to the tile */
+    /* width, i.e. consecutive rows are not adjacent in memory in at least one of the tiles  */
+
+    for (z = 0; z < dim3Size; z++) /* Loop across dim3 */
+    {
+      x = 0;
+      /* Each row is split into column strips along dimension 1; for every strip */
+      /* all dim2Size rows are processed before moving on to the next strip.     */
+      /* This loop handles strips while more than 3 vectors remain in the row:   */
+      /* 3 full vectors plus a variable-length 4th one.                          */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* initialize input and output data pointer to the strip start in plane z */
+        uint8_t *pInput1 = pInput + x + (z * inDataPitch2);
+        int8_t *pOutput1 = pOutput + x + (z * outDataPitch2);
+        int32_t varLen   = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++)
+        {
+          pdvecIn  = (xb_vec2Nx8U *) (pInput1 + (y * inDataPitch1));
+          pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1));
+          vaInData = IVP_LA2NX8U_PP(pdvecIn);
+
+          /* Load Input Data */
+          IVP_LA2NX8U_IP(vecValue1, vaInData, pdvecIn);
+          IVP_LA2NX8U_IP(vecValue2, vaInData, pdvecIn);
+          IVP_LA2NX8U_IP(vecValue3, vaInData, pdvecIn);
+          IVP_LAV2NX8U_XP(vecValue4, vaInData, pdvecIn, varLen);
+
+          /* Perform unsigned to signed conversion and rounding off operation */
+          vecValue1 = IVP_AVGRU2NX8(vecValue1, 0);
+          vecValue2 = IVP_AVGRU2NX8(vecValue2, 0);
+          vecValue3 = IVP_AVGRU2NX8(vecValue3, 0);
+          vecValue4 = IVP_AVGRU2NX8(vecValue4, 0);
+
+          /* Perform saturation of signed max value */
+          vecValueSigned1 = IVP_MINU2NX8U(signedCharMax, vecValue1);
+          vecValueSigned2 = IVP_MINU2NX8U(signedCharMax, vecValue2);
+          vecValueSigned3 = IVP_MINU2NX8U(signedCharMax, vecValue3);
+          vecValueSigned4 = IVP_MINU2NX8U(signedCharMax, vecValue4);
+
+          /* Store */
+          IVP_SA2NX8_IP(vecValueSigned1, vaOutData, pdvecOut);
+          IVP_SA2NX8_IP(vecValueSigned2, vaOutData, pdvecOut);
+          IVP_SA2NX8_IP(vecValueSigned3, vaOutData, pdvecOut);
+          IVP_SAV2NX8_XP(vecValueSigned4, vaOutData, pdvecOut, varLen);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X)) /* more than 2 vectors left: 2 full vectors plus a variable-length tail */
+      {
+        /* initialize input and output data pointer */
+        uint8_t *pInput1 = pInput + x + (z * inDataPitch2);
+        int8_t *pOutput1 = pOutput + x + (z * outDataPitch2);
+        int32_t varLen   = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++)
+        {
+          pdvecIn  = (xb_vec2Nx8U *) (pInput1 + (y * inDataPitch1));
+          pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1));
+          vaInData = IVP_LA2NX8U_PP(pdvecIn);
+
+          /* Load Input Data */
+          IVP_LA2NX8U_IP(vecValue1, vaInData, pdvecIn);
+          IVP_LA2NX8U_IP(vecValue2, vaInData, pdvecIn);
+          IVP_LAV2NX8U_XP(vecValue3, vaInData, pdvecIn, varLen);
+
+          /* Perform unsigned to signed conversion and rounding off operation */
+          vecValue1 = IVP_AVGRU2NX8(vecValue1, 0);
+          vecValue2 = IVP_AVGRU2NX8(vecValue2, 0);
+          vecValue3 = IVP_AVGRU2NX8(vecValue3, 0);
+
+          /* Perform saturation of signed max value */
+          vecValueSigned1 = IVP_MINU2NX8U(signedCharMax, vecValue1);
+          vecValueSigned2 = IVP_MINU2NX8U(signedCharMax, vecValue2);
+          vecValueSigned3 = IVP_MINU2NX8U(signedCharMax, vecValue3);
+
+          /* Store */
+          IVP_SA2NX8_IP(vecValueSigned1, vaOutData, pdvecOut);
+          IVP_SA2NX8_IP(vecValueSigned2, vaOutData, pdvecOut);
+          IVP_SAV2NX8_XP(vecValueSigned3, vaOutData, pdvecOut, varLen);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth)) /* more than 1 vector left: 1 full vector plus a variable-length tail */
+      {
+        /* initialize input and output data pointer */
+        uint8_t *pInput1 = pInput + x + (z * inDataPitch2);
+        int8_t *pOutput1 = pOutput + x + (z * outDataPitch2);
+        int32_t varLen   = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++)
+        {
+          pdvecIn  = (xb_vec2Nx8U *) (pInput1 + (y * inDataPitch1));
+          pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1));
+          vaInData = IVP_LA2NX8U_PP(pdvecIn);
+
+          /* Load Input Data */
+          IVP_LA2NX8U_IP(vecValue1, vaInData, pdvecIn);
+          IVP_LAV2NX8U_XP(vecValue2, vaInData, pdvecIn, varLen);
+
+          /* Perform unsigned to signed conversion and rounding off operation */
+          vecValue1 = IVP_AVGRU2NX8(vecValue1, 0);
+          vecValue2 = IVP_AVGRU2NX8(vecValue2, 0);
+
+          /* Perform saturation of signed max value */
+          vecValueSigned1 = IVP_MINU2NX8U(signedCharMax, vecValue1);
+          vecValueSigned2 = IVP_MINU2NX8U(signedCharMax, vecValue2);
+
+          /* Store */
+          IVP_SA2NX8_IP(vecValueSigned1, vaOutData, pdvecOut);
+          IVP_SAV2NX8_XP(vecValueSigned2, vaOutData, pdvecOut, varLen);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+      }
+      else if (x < dim1Size) /* at most 1 vector left: a single variable-length transfer */
+      {
+        /* initialize input and output data pointer */
+        uint8_t *pInput1 = pInput + x + (z * inDataPitch2);
+        int8_t *pOutput1 = pOutput + x + (z * outDataPitch2);
+        int32_t varLen   = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++)
+        {
+          pdvecIn  = (xb_vec2Nx8U *) (pInput1 + (y * inDataPitch1));
+          pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1));
+          vaInData = IVP_LA2NX8U_PP(pdvecIn);
+
+          /* Load Input Data */
+          IVP_LAV2NX8U_XP(vecValue1, vaInData, pdvecIn, varLen);
+
+          /* Perform unsigned to signed conversion and rounding off operation */
+          vecValue1 = IVP_AVGRU2NX8(vecValue1, 0);
+
+          /* Perform saturation of signed max value */
+          vecValueSigned1 = IVP_MINU2NX8U(signedCharMax, vecValue1);
+
+          /* Store */
+          IVP_SAV2NX8_XP(vecValueSigned1, vaOutData, pdvecOut, varLen);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_S8S16 ************************/
+/* Description : P6 implementation for conversion from S8 to S16       */
+/* Inputs      : Input Tile, scale, shift                              */
+/* Outputs     : XI Error Code                                         */
+/* InOuts      : Output Tile                                           */
+/* Assumptions : InData is signed 8bit                                 */
+/***********************************************************************/
+
+XAI_ERR_TYPE xaiDataConversion3D_S8S16(const xai_pTile3D inTile,
+                                       xai_pTile3D outTile,
+                                       const uint16_t scale,
+                                       const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_TILE3D_S16(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  valign vaOut = IVP_ZALIGN();
+
+  /* Get Data Pointers */
+  int8_t *pInput   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int16_t *pOutput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* vectorization width */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx8 * restrict pvecIn;
+  xb_vecNx16 * restrict pvecOut;
+
+  /* input and output data vectors */
+  xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+  xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3;
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If above condition holds good, data elements for which data           */
+  /*      conversion from S8 bit to S16 bit need to done present in contiguous  */
+  /*      memory location. Hence vectorization can be utilized effectively      */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile size or input tile     */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - In this scenario, data elements for which data conversion from S8 bit */
+  /*      S16 bit need to done exist in non-contiguous memory location.         */
+  /*      In order to do vectorization across first dimension, output data      */
+  /*      pointers need to be updated based on output tile size and output tile */
+  /*      pitch.                                                                */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input and output vectors */
+    xb_vecNx16 vecInData;
+    xb_vecNx16 vecOut;
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx8 *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx16 *) (pOutput + (z * outTilePitch2));
+      valign vaInData = IVP_LANX8S_PP(pvecIn);
+      int32_t varlen;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* Load input data */
+        IVP_LANX8S_IP(vecInData, vaInData, pvecIn);
+
+        /* apply scale and shift to input data.
+         * multiplying with scale results in 32 way 48-bit
+         * data to which shift is applied, so final result is
+         * 32 way 16 bit.
+         */
+        vecOut = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData), shift);
+
+        /* store output data */
+        IVP_SANX16_IP(vecOut, vaOut, pvecOut);
+      }
+      varlen = (maxLoopCount - x);
+      IVP_LANX8S_IP(vecInData, vaInData, pvecIn);
+
+      /* apply scale and shift to input data.
+       * multiplying with scale results in 32 way 48-bit
+       * data to which shift is applied, so final result is
+       * 32 way 16 bit.
+       */
+      vecOut = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData), shift);
+
+      /* store output data */
+      IVP_SAVNX16_XP(vecOut, vaOut, pvecOut, (varlen << 1));
+      IVP_SAPOSNX16_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch                                                   */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData3, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift);
+
+          vecOut2 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2), shift);
+
+          vecOut3 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData3), shift);
+
+          /* Store output data */
+          IVP_SANX16_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX16_IP(vecOut1, vaOut, pvecOut);
+          IVP_SANX16_IP(vecOut2, vaOut, pvecOut);
+          IVP_SAVNX16_XP(vecOut3, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData2, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift);
+
+          vecOut2 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2), shift);
+
+          /* Store output data */
+          IVP_SANX16_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX16_IP(vecOut1, vaOut, pvecOut);
+          IVP_SAVNX16_XP(vecOut2, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift);
+
+          /* Store output data */
+          IVP_SANX16_IP(vecOut0, vaOut, pvecOut);
+          IVP_SAVNX16_XP(vecOut1, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          /* Store output data */
+          IVP_SAVNX16_XP(vecOut0, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_U8S8 ***********************/
+/* Description : P6 U8 to S8 conversion, applying scale and shift    */
+/* Inputs      : Input Tile (U8), scale, shift (checked to be < 24)  */
+/* Outputs     : XAI Error Code                                      */
+/* InOuts      : Output Tile (S8), same size/order as Input Tile     */
+/* Assumptions : InData is unsigned 8bit                             */
+/*********************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D_U8S8(const xai_pTile3D inTile,
+                                      xai_pTile3D outTile,
+                                      const uint16_t scale,
+                                      const uint8_t shift)
+{
+  /* Error Checks. NOTE(review): unlike xaiDataConversion3D_U8S16, tile overlap is not checked here */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U8(inTile);
+    XAI_CHECK_TILE3D_S8(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  valign vaOut = IVP_ZALIGN(); /* alignment state shared by all aligning stores below */
+
+  /* Get Data Pointers */
+  uint8_t *pInput = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* vectorization width and its multiples, used as x-step and tail thresholds */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx8U * restrict pvecIn;
+  xb_vecNx8 * restrict pvecOut;
+
+  /* input and output data vectors */
+  xb_vecNx16U vecInData0, vecInData1, vecInData2, vecInData3;
+  xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3;
+
+  /********************************************************************************/
+  /* The overall design approach is split into 2 parts                            */
+  /* 1. When the input tile pitch and the output tile pitch are both equal to the */
+  /*    tile width (dim1Size)                                                     */
+  /*    - If the above condition holds, the data elements that need to be         */
+  /*      converted from U8 to S8 are present in contiguous memory locations.     */
+  /*      Hence vectorization can be utilized effectively                         */
+  /*                                                                              */
+  /* 2. When the input tile pitch or the output tile pitch is not equal to the    */
+  /*    tile width (dim1Size)                                                     */
+  /*    - In this scenario, the data elements that need to be converted from U8   */
+  /*      to S8 exist in non-contiguous memory locations.                         */
+  /*      In order to do vectorization across the first dimension, the input and  */
+  /*      output data pointers are recomputed for every row from the tile         */
+  /*      pitches.                                                                */
+  /********************************************************************************/
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+    /* Input and output vectors */
+    xb_vecNx16U vecInData;
+    xb_vecNx16 vecOut;
+
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Update loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Whole tile is one contiguous run: process it in a single pass */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointers for pass z */
+      pvecIn  = (xb_vecNx8U *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx8 *) (pOutput + (z * outTilePitch2));
+      valign vaInData = IVP_LANX8U_PP(pvecIn); /* alignment state used by the aligning loads below */
+      int32_t varlen;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) /* last vector is handled after the loop */
+      {
+        /* Load input data */
+        IVP_LANX8U_IP(vecInData, vaInData, pvecIn);
+
+        /* Apply scale and shift to the input data.
+         * Multiplying by scale gives 32-way 48-bit data, to
+         * which shift is applied. Limiting to 0..SCHAR_MAX is
+         * presumably done by the Nx8S store (TODO confirm), so
+         * the final result is 32-way, 8-bit.
+         */
+        vecOut = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData), shift);
+
+        /* store output data */
+        IVP_SANX8S_IP(vecOut, vaOut, pvecOut);
+      }
+      varlen = (maxLoopCount - x); /* elements left for the final, possibly partial, vector */
+      IVP_LANX8U_IP(vecInData, vaInData, pvecIn);
+
+      /* Apply scale and shift to the input data.
+       * Multiplying by scale gives 32-way 48-bit data, to
+       * which shift is applied. Limiting to 0..SCHAR_MAX is
+       * presumably done by the Nx8S store (TODO confirm), so
+       * the final result is 32-way, 8-bit.
+       */
+      vecOut = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData), shift);
+
+      /* store the last varlen output elements */
+      IVP_SAVNX8S_XP(vecOut, vaOut, pvecOut, varlen);
+      IVP_SAPOSNX8S_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if the input tile pitch or the output tile pitch is not equal to   */
+    /* the tile width (dim1Size)                                                                 */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension: runs while more than 3 vectors of a row remain */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize row-0 pointers for this column strip; varLen = elements for the 4th store */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData3, vaInData, pvecIn);
+
+          /* Apply scale and shift to the input data.
+           * Multiplying by scale gives 32-way 48-bit data, to
+           * which shift is applied. Limiting to 0..SCHAR_MAX is
+           * presumably done by the Nx8S store (TODO confirm), so
+           * the final result is 32-way, 8-bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift);
+
+          vecOut2 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2), shift);
+
+          vecOut3 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData3), shift);
+
+          /* Store output data; the last store is limited to varLen elements */
+          IVP_SANX8S_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX8S_IP(vecOut1, vaOut, pvecOut);
+          IVP_SANX8S_IP(vecOut2, vaOut, pvecOut);
+          IVP_SAVNX8S_XP(vecOut3, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8S_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* 3-vector tail: row-0 pointers for this strip; varLen = elements for the 3rd store */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData2, vaInData, pvecIn);
+
+          /* Apply scale and shift to the input data.
+           * Multiplying by scale gives 32-way 48-bit data, to
+           * which shift is applied. Limiting to 0..SCHAR_MAX is
+           * presumably done by the Nx8S store (TODO confirm), so
+           * the final result is 32-way, 8-bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift);
+
+          vecOut2 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2), shift);
+
+          /* Store output data; the last store is limited to varLen elements */
+          IVP_SANX8S_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX8S_IP(vecOut1, vaOut, pvecOut);
+          IVP_SAVNX8S_XP(vecOut2, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8S_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* 2-vector tail: row-0 pointers for this strip; varLen = elements for the 2nd store */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+
+          /* Apply scale and shift to the input data.
+           * Multiplying by scale gives 32-way 48-bit data, to
+           * which shift is applied. Limiting to 0..SCHAR_MAX is
+           * presumably done by the Nx8S store (TODO confirm), so
+           * the final result is 32-way, 8-bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift);
+
+          /* Store output data; the last store is limited to varLen elements */
+          IVP_SANX8S_IP(vecOut0, vaOut, pvecOut);
+          IVP_SAVNX8S_XP(vecOut1, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8S_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* 1-vector tail: row-0 pointers for this strip; varLen = elements for the only store */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+
+          /* Apply scale and shift to the input data.
+           * Multiplying by scale gives 32-way 48-bit data, to
+           * which shift is applied. Limiting to 0..SCHAR_MAX is
+           * presumably done by the Nx8S store (TODO confirm), so
+           * the final result is 32-way, 8-bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          /* Store output data, limited to varLen elements */
+          IVP_SAVNX8S_XP(vecOut0, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8S_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_U8S16 ***********************/
+/* Description : P6 implementation for conversion from U8 to S16      */
+/* Inputs      : Input Tile, scale, shift                             */
+/* Outputs     : XI Error Code                                        */
+/* InOuts      : Output Tile                                          */
+/* Assumptions : InData is unsigned 8bit                              */
+/**********************************************************************/
+
+XAI_ERR_TYPE xaiDataConversion3D_U8S16(const xai_pTile3D inTile,
+                                       xai_pTile3D outTile,
+                                       const uint16_t scale,
+                                       const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U8(inTile);
+    XAI_CHECK_TILE3D_S16(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  valign vaOut = IVP_ZALIGN();
+
+  /* Get Data Pointers */
+  uint8_t *pInput  = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int16_t *pOutput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* vectorization width */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx8U * restrict pvecIn;
+  xb_vecNx16 * restrict pvecOut;
+
+  /* input and output data vectors */
+  xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+  xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3;
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If above condition holds good, data elements for which data           */
+  /*      conversion from U8 bit to S16 bit need to done present in contiguous  */
+  /*      memory location. Hence vectorization can be utilized effectively      */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile size or input tile     */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - In this scenario, data elements for which data conversion from U8 bit */
+  /*      S16 bit need to done exist in non-contiguous memory location.         */
+  /*      In order to do vectorization across first dimension, output data      */
+  /*      pointers need to be updated based on output tile size and output tile */
+  /*      pitch.                                                                */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input and output vectors */
+    xb_vecNx16U vecInData;
+    xb_vecNx16 vecOut;
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx8U *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx16 *) (pOutput + (z * outTilePitch2));
+      valign vaInData = IVP_LANX8U_PP(pvecIn);
+      int32_t varlen;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* Load input data */
+        IVP_LANX8U_IP(vecInData, vaInData, pvecIn);
+
+        /* apply scale and shift to input data.
+         * multiplying with scale results in 32 way 48-bit
+         * data to which shift is applied, so final result is
+         * 32 way 16 bit.
+         */
+        vecOut = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData), shift);
+
+        /* store output data */
+        IVP_SANX16_IP(vecOut, vaOut, pvecOut);
+      }
+      varlen = (maxLoopCount - x);
+      IVP_LANX8U_IP(vecInData, vaInData, pvecIn);
+
+      /* apply scale and shift to input data.
+       * multiplying with scale results in 32 way 48-bit
+       * data to which shift is applied, so final result is
+       * 32 way 16 bit.
+       */
+      vecOut = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData), shift);
+
+      /* store output data */
+      IVP_SAVNX16_XP(vecOut, vaOut, pvecOut, (varlen << 1));
+      IVP_SAPOSNX16_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch                                                   */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData3, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift);
+
+          vecOut2 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2), shift);
+
+          vecOut3 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData3), shift);
+
+          /* Store output data */
+          IVP_SANX16_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX16_IP(vecOut1, vaOut, pvecOut);
+          IVP_SANX16_IP(vecOut2, vaOut, pvecOut);
+          IVP_SAVNX16_XP(vecOut3, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData2, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift);
+
+          vecOut2 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2), shift);
+
+          /* Store output data */
+          IVP_SANX16_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX16_IP(vecOut1, vaOut, pvecOut);
+          IVP_SAVNX16_XP(vecOut2, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift);
+
+          /* Store output data */
+          IVP_SANX16_IP(vecOut0, vaOut, pvecOut);
+          IVP_SAVNX16_XP(vecOut1, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          /* Store output data */
+          IVP_SAVNX16_XP(vecOut0, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_U8U16 ***********************/
+/* Description : P6 implementation for conversion from U8 to U16      */
+/* Inputs      : Input Tile (U8), scale (uint16_t), shift (< 24)      */
+/* Outputs     : XAI Error Code                                       */
+/* InOuts      : Output Tile (U16, same size and data order as input) */
+/* Assumptions : InData is unsigned 8bit; tiles do not overlap        */
+/**********************************************************************/
+
+XAI_ERR_TYPE xaiDataConversion3D_U8U16(const xai_pTile3D inTile,
+                                       xai_pTile3D outTile,
+                                       const uint16_t scale,
+                                       const uint8_t shift)
+{
+  /* Error Checks: tile types, DRAM boundary, equal sizes, no overlap, shift < 24, same data order */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U8(inTile);
+    XAI_CHECK_TILE3D_U16(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters (pitches are added to typed data pointers below, i.e. counted in elements) */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  valign vaOut = IVP_ZALIGN(); /* alignment state shared by every aligning store below */
+
+  /* Get Data Pointers */
+  uint8_t *pInput   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  uint16_t *pOutput = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* vectorization width (elements consumed per vector load) and its double for the 2x unrolled loop */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx8U * restrict pvecIn;
+  xb_vecNx16U * restrict pvecOut;
+
+  /* input data vectors (used only by the non-contiguous path) */
+  xb_vecNx16 vecInData0, vecInData1;
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If above condition holds good, data elements for which data           */
+  /*      conversion from U8 to U16 needs to be done are present in contiguous  */
+  /*      memory location. Hence vectorization can be utilized effectively      */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile width or input tile    */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - In this scenario, data elements for which data conversion from U8 to  */
+  /*      U16 needs to be done exist in non-contiguous memory locations.        */
+  /*      In order to do vectorization across first dimension, output data      */
+  /*      pointers need to be updated based on output tile size and output tile */
+  /*      pitch.                                                                */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input vector */
+    xb_vecNx16U vecInData;
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Update loop counts: if whole planes are also contiguous, process the tile as one flat run */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx8U *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx16U *) (pOutput + (z * outTilePitch2));
+      valign vaInData = IVP_LANX8U_PP(pvecIn);
+      int32_t varlen;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* Load input data */
+        IVP_LANX8U_IP(vecInData, vaInData, pvecIn);
+
+        /* apply scale and shift to input data.
+         * multiplying with scale results in 32 way 48-bit
+         * data to which shift is applied; the two packed 32-bit halves are
+         * limited to USHRT_MAX and interleaved, so final result is 32 way 16 bit.
+         */
+        xb_vecN_2x32v hvecEven = IVP_PACKVRNX48_0(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData), shift);
+        xb_vecN_2x32v hvecOdd  = IVP_PACKVRNX48_1(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData), shift);
+        xb_vecNx16U vecOut     = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecOdd, USHRT_MAX)), \
+                                              IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecEven, USHRT_MAX)), IVP_SELI_INTERLEAVE_1_EVEN);
+
+        /* store output data */
+        IVP_SANX16U_IP(vecOut, vaOut, pvecOut);
+      }
+      varlen = (maxLoopCount - x); /* elements left for the final vector; the strict '<' above always leaves this tail */
+      IVP_LANX8U_IP(vecInData, vaInData, pvecIn);
+
+      /* apply scale and shift to input data.
+       * multiplying with scale results in 32 way 48-bit
+       * data to which shift is applied; the two packed 32-bit halves are
+       * limited to USHRT_MAX and interleaved, so final result is 32 way 16 bit.
+       */
+      xb_vecN_2x32v hvecEven = IVP_PACKVRNX48_0(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData), shift);
+      xb_vecN_2x32v hvecOdd  = IVP_PACKVRNX48_1(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData), shift);
+      xb_vecNx16U vecOut     = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecOdd, USHRT_MAX)), \
+                                            IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecEven, USHRT_MAX)), IVP_SELI_INTERLEAVE_1_EVEN);
+
+      /* store output data; count is (varlen << 1), presumably bytes for 2-byte elements - TODO confirm */
+      IVP_SAVNX16U_XP(vecOut, vaOut, pvecOut, (varlen << 1));
+      IVP_SAPOSNX16U_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch; each row is then addressed through the pitches   */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+
+      for (; x < (dim1Size - vectorizationWidth2X); x += vectorizationWidth2X) /* two full vectors per row */
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        uint16_t *pOut = &pOutput[z * outTilePitch2 + x];
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16U *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied; the two packed 32-bit halves are
+           * limited to USHRT_MAX and interleaved, so final result is 32 way 16 bit.
+           */
+          xb_vecN_2x32v hvecEven0 = IVP_PACKVRNX48_0(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData0), shift);
+          xb_vecN_2x32v hvecOdd0  = IVP_PACKVRNX48_1(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData0), shift);
+          xb_vecNx16U vecOut0     = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecOdd0, USHRT_MAX)), \
+                                                 IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecEven0, USHRT_MAX)), IVP_SELI_INTERLEAVE_1_EVEN);
+
+          xb_vecN_2x32v hvecEven1 = IVP_PACKVRNX48_0(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData1), shift);
+          xb_vecN_2x32v hvecOdd1  = IVP_PACKVRNX48_1(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData1), shift);
+          xb_vecNx16U vecOut1     = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecOdd1, USHRT_MAX)), \
+                                                 IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecEven1, USHRT_MAX)), IVP_SELI_INTERLEAVE_1_EVEN);
+
+          /* Store output data */
+          IVP_SANX16U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX16U_IP(vecOut1, vaOut, pvecOut);
+          IVP_SAPOSNX16U_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < dim1Size) /* remaining columns: 1 .. 2 * vectorizationWidth */
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        uint16_t *pOut = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+
+          /* load input data; second load count is <= 0 when the tail fits in one vector (presumably loads nothing - TODO confirm) */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LAVNX8U_XP(vecInData1, vaInData, pvecIn, varLen - vectorizationWidth);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied; the two packed 32-bit halves are
+           * limited to USHRT_MAX and interleaved, so final result is 32 way 16 bit.
+           */
+          xb_vecN_2x32v hvecEven0 = IVP_PACKVRNX48_0(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData0), shift);
+          xb_vecN_2x32v hvecOdd0  = IVP_PACKVRNX48_1(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData0), shift);
+          xb_vecNx16U vecOut0     = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecOdd0, USHRT_MAX)), \
+                                                 IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecEven0, USHRT_MAX)), IVP_SELI_INTERLEAVE_1_EVEN);
+
+          xb_vecN_2x32v hvecEven1 = IVP_PACKVRNX48_0(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData1), shift);
+          xb_vecN_2x32v hvecOdd1  = IVP_PACKVRNX48_1(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData1), shift);
+          xb_vecNx16U vecOut1     = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecOdd1, USHRT_MAX)), \
+                                                 IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecEven1, USHRT_MAX)), IVP_SELI_INTERLEAVE_1_EVEN);
+
+          /* Store output data with variable-length stores; counts are passed as (elements << 1) */
+          IVP_SAVNX16U_XP(vecOut0, vaOut, pvecOut, (varLen << 1));
+          IVP_SAVNX16U_XP(vecOut1, vaOut, pvecOut, ((varLen - vectorizationWidth) << 1));
+          IVP_SAPOSNX16U_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_S8U8 ************************/
+/* Description : P6 implementation for conversion from S8 to U8       */
+/* Inputs      : Input Tile (S8), scale (uint16_t), shift (< 24)      */
+/* Outputs     : XAI Error Code                                       */
+/* InOuts      : Output Tile (U8, same size and data order as input)  */
+/* Assumptions : InData is signed 8bit                                */
+/**********************************************************************/
+
+XAI_ERR_TYPE xaiDataConversion3D_S8U8(const xai_pTile3D inTile,
+                                      xai_pTile3D outTile,
+                                      const uint16_t scale,
+                                      const uint8_t shift)
+{
+  /* Error Checks: tile types, DRAM boundary, equal sizes, shift < 24, same data order */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_TILE3D_U8(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+  /* Get Tile Parameters and the output clamp limits */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const int16_t minLim        = 0; /* lower clamp bound of the result */
+  const int16_t maxLim        = UCHAR_MAX; /* upper clamp bound of the result */
+
+  valign vaOut = IVP_ZALIGN(); /* alignment state shared by every aligning store below */
+
+  /* Get Data Pointers */
+  int8_t *pInput   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  uint8_t *pOutput = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* vectorization width (elements consumed per vector load) and its multiples for the unrolled loop and tails */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx8 * restrict pvecIn;
+  xb_vecNx8U * restrict pvecOut;
+
+  /* input and output data vectors (used only by the non-contiguous path) */
+  xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+  xb_vecNx16U vecOut0, vecOut1, vecOut2, vecOut3;
+
+  /********************************************************************************/
+  /* The overall design approach is split into 2 parts                            */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch   */
+  /*    is equal to output tile pitch                                             */
+  /*    - If above condition holds good, data elements for which data             */
+  /*      conversion from S8 to U8 needs to be done are present in contiguous     */
+  /*      memory location. Hence vectorization can be utilized effectively        */
+  /*                                                                              */
+  /* 2. When input tile pitch is not equal to input tile width or input tile      */
+  /*    pitch is not equal to output tile pitch                                   */
+  /*    - In this scenario, data elements for which data conversion from S8 to    */
+  /*      U8 needs to be done exist in non-contiguous memory locations.           */
+  /*      In order to do vectorization across first dimension, output data        */
+  /*      pointers need to be updated based on output tile size and output tile   */
+  /*      pitch.                                                                  */
+  /********************************************************************************/
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+    /* Input and output vectors */
+    xb_vecNx16 vecInData;
+    xb_vecNx16 vecOut;
+
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Update loop counts: if whole planes are also contiguous, process the tile as one flat run */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx8 *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx8U *) (pOutput + (z * outTilePitch2));
+
+      valign vaInData = IVP_LANX8S_PP(pvecIn);
+      int32_t varlen;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* Load input data */
+        IVP_LANX8S_IP(vecInData, vaInData, pvecIn);
+
+        /* apply scale and shift to input data.
+         * multiplying with scale results in 32 way 48-bit
+         * data to which shift is applied and data is clamped
+         * to the 8 bit range 0 to UCHAR_MAX. So the final result
+         * is 32-way, 8-bit.
+         */
+        vecOut = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData), shift);
+        vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+        /* store output data */
+        IVP_SANX8U_IP(vecOut, vaOut, pvecOut);
+      }
+      varlen = (maxLoopCount - x); /* elements left for the final vector; the strict '<' above always leaves this tail */
+      IVP_LANX8S_IP(vecInData, vaInData, pvecIn);
+
+      /* apply scale and shift to input data.
+       * multiplying with scale results in 32 way 48-bit
+       * data to which shift is applied and data is clamped
+       * to the 8 bit range 0 to UCHAR_MAX. So the final result
+       * is 32-way, 8-bit.
+       */
+      vecOut = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData), shift);
+      vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+      /* store output data; variable-length store of the remaining varlen elements */
+      IVP_SAVNX8U_XP(vecOut, vaOut, pvecOut, varlen);
+      IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch; each row is then addressed through the pitches   */
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension; runs while more than 3 vector widths remain, 4th store is variable-length */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        uint8_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData3, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied and data is clamped
+           * to the 8 bit range 0 to UCHAR_MAX. So the final result
+           * is 32-way, 8-bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift);
+          vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          vecOut2 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2), shift);
+          vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          vecOut3 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData3), shift);
+          vecOut3 = IVP_MAXNX16(IVP_MINNX16(vecOut3, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data; varLen may exceed one vector here (presumably the store caps the count - TODO confirm) */
+          IVP_SANX8U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX8U_IP(vecOut1, vaOut, pvecOut);
+          IVP_SANX8U_IP(vecOut2, vaOut, pvecOut);
+          IVP_SAVNX8U_XP(vecOut3, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X)) /* tail: more than 2 vector widths remain */
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        uint8_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData2, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied and data is clamped
+           * to the 8 bit range 0 to UCHAR_MAX. So the final result
+           * is 32-way, 8-bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift);
+          vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          vecOut2 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2), shift);
+          vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data; last store covers only the varLen leftover elements */
+          IVP_SANX8U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX8U_IP(vecOut1, vaOut, pvecOut);
+          IVP_SAVNX8U_XP(vecOut2, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth)) /* tail: more than 1 vector width remains */
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        uint8_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied and data is clamped
+           * to the 8 bit range 0 to UCHAR_MAX. So the final result
+           * is 32-way, 8-bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift);
+          vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data; last store covers only the varLen leftover elements */
+          IVP_SANX8U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SAVNX8U_XP(vecOut1, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size) /* tail: at most 1 vector width remains */
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        uint8_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied and data is clamped
+           * to the 8 bit range 0 to UCHAR_MAX. So the final result
+           * is 32-way, 8-bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data; only the varLen leftover elements are written */
+          IVP_SAVNX8U_XP(vecOut0, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_S16 *****************************/
+/* Description : P6 implementation for conversion  S16 to S16             */
+/*               depending on Output Tile type                            */
+/* Inputs      : Input Tile, scale, shift                                 */
+/* Outputs     : XI Error Code                                            */
+/* InOuts      : Output Tile                                              */
+/* Assumptions : InData is signed 16bit                                   */
+/**************************************************************************/
+
+XAI_ERR_TYPE xaiDataConversion3D_S16(const xai_pTile3D inTile,
+                                     xai_pTile3D outTile,
+                                     const uint16_t scale,
+                                     const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S16(inTile);
+    XAI_CHECK_TILE3D_S16(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 32", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  int16_t *pInput  = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int16_t *pOutput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  valign vaOut     = IVP_ZALIGN();
+
+  /* vectorization width */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx16 * restrict pvecIn;
+  xb_vecNx16 * restrict pvecOut;
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If above condition holds good, data elements for which data           */
+  /*      conversion from signed 16 bit to S16 bit need to done present in      */
+  /*      in contiguous memory location. Hence vectorization can be utilized    */
+  /*      effectively                                                           */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile size or input tile     */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - In this scenario, data elements for which data conversion from signed */
+  /*      16 bit to S16 bit need to done exist in non-contiguous memory         */
+  /*      location. In order to do vectorization across first dimension,        */
+  /*      output data pointers need to be updated based on output tile size     */
+  /*      and output tile pitch                                                 */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input data vectors */
+    xb_vecNx16 vecInData;
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx16 *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx16 *) (pOutput + (z * outTilePitch2));
+
+      valign vaInData = IVP_LANX16_PP(pvecIn);
+      xb_vecNx16 vecOut;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* load input data */
+        IVP_LANX16_IP(vecInData, vaInData, pvecIn);
+
+        /* apply scale and shift to input data.
+         * multiplying with scale results in 32 way 48-bit
+         * data to which shift is applied, so final result is
+         * 32 way 16 bit.
+         */
+        vecOut = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData), shift);
+
+        /* store output data */
+        IVP_SANX16_IP(vecOut, vaOut, pvecOut);
+      }
+      int32_t varLen = (maxLoopCount - x);
+      /* load input data */
+      IVP_LANX16_IP(vecInData, vaInData, pvecIn);
+
+      /* apply scale and shift to input data.
+       * multiplying with scale results in 32 way 48-bit
+       * data to which shift is applied, so final result is
+       * 32 way 16 bit.
+       */
+      vecOut = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData), shift);
+
+      /* store output data */
+      IVP_SAVNX16_XP(vecOut, vaOut, pvecOut, (varLen << 1));
+      IVP_SAPOSNX16_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch                                                   */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+          xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData3, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift);
+          vecOut2 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2), shift);
+          vecOut3 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData3), shift);
+
+          /* Store output data */
+          IVP_SANX16_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX16_IP(vecOut1, vaOut, pvecOut);
+          IVP_SANX16_IP(vecOut2, vaOut, pvecOut);
+          IVP_SAVNX16_XP(vecOut3, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1, vecInData2;
+          xb_vecNx16 vecOut0, vecOut1, vecOut2;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData2, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift);
+          vecOut2 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2), shift);
+
+          /* Store output data */
+          IVP_SANX16_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX16_IP(vecOut1, vaOut, pvecOut);
+          IVP_SAVNX16_XP(vecOut2, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1;
+          xb_vecNx16 vecOut0, vecOut1;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift);
+
+          /* Store output data */
+          IVP_SANX16_IP(vecOut0, vaOut, pvecOut);
+          IVP_SAVNX16_XP(vecOut1, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0;
+          xb_vecNx16 vecOut0;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          /* Store output data */
+          IVP_SAVNX16_XP(vecOut0, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_S16I32 *****************************/
+/* Description : P6 implementation converting an S16 tile to an I32 tile; */
+/*               applies scale and shift; lower clamp depends on out type */
+/* Inputs      : Input Tile (S16), scale, shift (must be less than 32)    */
+/* Outputs     : XAI error code (XAI_ERR_TYPE)                            */
+/* InOuts      : Output Tile (same size and data order as Input Tile)     */
+/* Assumptions : InData is signed 16bit                                   */
+/**************************************************************************/
+
+XAI_ERR_TYPE xaiDataConversion3D_S16I32(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const uint16_t scale,
+                                        const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S16(inTile);
+    XAI_CHECK_TILE3D_I32(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 32", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  int16_t *pInput  = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int32_t *pOutput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  valign vaOut     = IVP_ZALIGN(); /* output alignment register shared by every store below */
+
+  /* vectorization width: number of 16-bit input elements consumed per vector load */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+  /* lower bound applied to every result: INT_MIN (no effect) for XAI_S32 output, 0 for any other output type */
+  const int32_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) ? INT_MIN : 0;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx16 * restrict pvecIn;
+  xb_vecN_2x32v * restrict pvecOut;
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When the dim1 pitch of both the input and the output tile is equal to   */
+  /*    the tile width (dim1Size)                                               */
+  /*    - Each row directly follows the previous one, so the data to be         */
+  /*      converted from signed 16 bit to I32 lies in contiguous memory         */
+  /*      (per plane, or for the whole tile if the dim2 pitches are also        */
+  /*      tight). Hence vectorization can be utilized effectively               */
+  /*                                                                            */
+  /* 2. When the dim1 pitch of the input or of the output tile differs from     */
+  /*    the tile width (dim1Size)                                               */
+  /*    - In this scenario, data elements to be converted from signed 16        */
+  /*      bit to I32 exist in non-contiguous memory locations. In order to      */
+  /*      do vectorization across the first dimension, input and output         */
+  /*      data pointers are recomputed for every row from the tile pitches      */
+  /*      and each row is processed in groups of up to 4 vectors                */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input data vectors */
+    xb_vecNx16 vecInData;
+    /* Initialize max loop counter: by default one run of dim1Size * dim2Size elements per plane */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Update loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter: the whole tile becomes one run of dim1Size * dim2Size * dim3Size elements */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer to the start of the current run */
+      pvecIn  = (xb_vecNx16 *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecN_2x32v *) (pOutput + (z * outTilePitch2));
+
+      valign vaInData = IVP_LANX16_PP(pvecIn);
+      xb_vecN_2x32v vecOutL, vecOutH;
+      /* full vectors only; for a non-empty run the last 1..vectorizationWidth elements are left to the tail code */
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* load input data */
+        IVP_LANX16_IP(vecInData, vaInData, pvecIn);
+        /* multiply by scale into 48-bit lanes; each half is then widened, shift-packed to 32 bit and clamped at minLim */
+        xb_vecNx48 vecOutIntm1    = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData);
+        xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1));
+        vecOutL = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+        vecOutL = IVP_MAXN_2X32(vecOutL, (xb_vecN_2x32v) minLim);
+
+        vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1));
+        vecOutH     = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+        vecOutH     = IVP_MAXN_2X32(vecOutH, (xb_vecN_2x32v) minLim);
+        /* store output data: vecOutL first, then vecOutH */
+        IVP_SAN_2X32_IP(vecOutL, vaOut, pvecOut);
+        IVP_SAN_2X32_IP(vecOutH, vaOut, pvecOut);
+      }
+      int32_t varLen = (maxLoopCount - x); /* elements left for the tail vector */
+      /* load input data (a full vector is read; the stores below are limited by varLen) */
+      IVP_LANX16_IP(vecInData, vaInData, pvecIn);
+
+      xb_vecNx48 vecOutIntm1    = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData);
+      xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1));
+      vecOutL = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+      vecOutL = IVP_MAXN_2X32(vecOutL, (xb_vecN_2x32v) minLim);
+
+      vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1));
+      vecOutH     = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+      vecOutH     = IVP_MAXN_2X32(vecOutH, (xb_vecN_2x32v) minLim);
+      /* store output data: (varLen << 2) = output bytes left; the 2nd store gets that minus one half vector,   */
+      /* i.e. (XCHAL_IVPN_SIMD_WIDTH << 1) bytes. NOTE(review): assumes the store clamps out-of-range counts - confirm */
+      IVP_SAVN_2X32_XP(vecOutL, vaOut, pvecOut, (varLen << 2));
+      IVP_SAVN_2X32_XP(vecOutH, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1));
+      IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if the input tile pitch or the output tile pitch is not equal to   */
+    /* the tile width; each row is then addressed separately through the dim1 pitches            */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension, taken while more than 3 vectors of the row remain */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X); /* row elements beyond the first 3 vectors */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+          xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData3, vaInData, pvecIn);
+
+          /* scale, widen, shift-pack and clamp at minLim each of the 4 vectors as two 32-bit halves (L, H) */
+          xb_vecNx48 vecOutIntm1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecOutIntm2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecOutIntm3 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2);
+          xb_vecNx48 vecOutIntm4 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData3);
+
+          xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1));
+          vecOut0L   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut0L   = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1));
+          vecOut0H   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut0H   = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm2), IVP_CVT64SNX48LL(vecOutIntm2));
+          vecOut1L   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut1L   = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm2), IVP_CVT64SNX48HL(vecOutIntm2));
+          vecOut1H   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut1H   = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm3), IVP_CVT64SNX48LL(vecOutIntm3));
+          vecOut2L   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut2L   = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm3), IVP_CVT64SNX48HL(vecOutIntm3));
+          vecOut2H   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut2H   = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm4), IVP_CVT64SNX48LL(vecOutIntm4));
+          vecOut3L   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut3L   = IVP_MAXN_2X32(vecOut3L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm4), IVP_CVT64SNX48HL(vecOutIntm4));
+          vecOut3H   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut3H   = IVP_MAXN_2X32(vecOut3H, (xb_vecN_2x32v) minLim);
+
+          /* Store output data: 3 full vectors, then the 4th limited to varLen elements */
+          IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut2L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut2H, vaOut, pvecOut);
+          IVP_SAVN_2X32_XP(vecOut3L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut3H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X)) /* more than 2 vectors of the row remain: process 3 */
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X); /* row elements beyond the first 2 vectors */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1, vecInData2;
+          xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData2, vaInData, pvecIn);
+
+          xb_vecNx48 vecOutIntm1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecOutIntm2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecOutIntm3 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2);
+
+          xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1));
+          vecOut0L   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut0L   = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1));
+          vecOut0H   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut0H   = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm2), IVP_CVT64SNX48LL(vecOutIntm2));
+          vecOut1L   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut1L   = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm2), IVP_CVT64SNX48HL(vecOutIntm2));
+          vecOut1H   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut1H   = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm3), IVP_CVT64SNX48LL(vecOutIntm3));
+          vecOut2L   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut2L   = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm3), IVP_CVT64SNX48HL(vecOutIntm3));
+          vecOut2H   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut2H   = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim);
+
+          /* Store output data: 2 full vectors, then the 3rd limited to varLen elements */
+          IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAVN_2X32_XP(vecOut2L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut2H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth)) /* more than 1 vector of the row remains: process 2 */
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth); /* row elements beyond the first vector */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1;
+          xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+
+
+          xb_vecNx48 vecOutIntm1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecOutIntm2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1);
+
+          xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1));
+          vecOut0L   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut0L   = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1));
+          vecOut0H   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut0H   = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm2), IVP_CVT64SNX48LL(vecOutIntm2));
+          vecOut1L   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut1L   = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm2), IVP_CVT64SNX48HL(vecOutIntm2));
+          vecOut1H   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut1H   = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim);
+
+          /* Store output data: 1 full vector, then the 2nd limited to varLen elements */
+          IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAVN_2X32_XP(vecOut1L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut1H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size) /* at most 1 vector of the row remains: process it */
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x; /* row elements still to be converted */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0;
+          xb_vecN_2x32v vecOut0L, vecOut0H;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+
+          xb_vecNx48 vecOutIntm1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0);
+
+          xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1));
+          vecOut0L   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut0L   = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1));
+          vecOut0H   = IVP_PACKVRN_2X64W(vecOutIntm, shift);
+          vecOut0H   = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+          /* Store output data: a single vector limited to varLen elements */
+          IVP_SAVN_2X32_XP(vecOut0L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut0H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_U16I32 *****************************/
+/* Description : P6 implementation for conversion  U16 to I32             */
+/*               depending on Output Tile type                            */
+/* Inputs      : Input Tile, scale, shift                                 */
+/* Outputs     : XI Error Code                                            */
+/* InOuts      : Output Tile                                              */
+/* Assumptions : InData is unsigned 16bit                                 */
+/**************************************************************************/
+
+XAI_ERR_TYPE xaiDataConversion3D_U16I32(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const uint16_t scale,
+                                        const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U16(inTile);
+    XAI_CHECK_TILE3D_I32(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 32", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  uint16_t *pInput = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int32_t *pOutput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  valign vaOut     = IVP_ZALIGN();
+
+  /* vectorization width */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+  const uint32_t rndVal              = (1 << (shift - 1));
+  const int32_t minLim               = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) ? 0 : INT_MIN;
+  /******************************************************************************************************/
+  /*usage of minLim																				                                              */
+  /*U16 x U16 = U32 - result is in U32. We have two output variants S32 and U32				                  */
+  /*For S32 output we need to clamp i.e.,(MIN(res,INT_MAX)) result using S32_MAX				                */
+  /*For U32 output we need to clamp i.e., (MIN(res,UINT_MAX)) result using U32_MAX				              */
+  /*PACK ISA available (IVP_PACKVRN_2X64W) will clamp the result to S32 range only				              */
+  /*one option to implement this is to write two APIs with change only in clamping operation -	        */
+  /*Note : we don't prefer using an if inside loop                                                      */
+  /*To avoid above condition below code uses a hack - Final res is in S32 container so -		            */
+  /* U32 to S32 can be done by MAX(0,res) and U32 to U32 can be done by MAX(INT_MIN,res)		            */
+  /* MAX(0,res) will work because all values above S32_MAX will be interpreted as < 0 in S32 container  */
+  /******************************************************************************************************/
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx16U * restrict pvecIn;
+  xb_vecN_2x32v * restrict pvecOut;
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If above condition holds good, data elements for which data           */
+  /*      conversion from unsigned 16 bit to I32 needs to be done are present   */
+  /*      in contiguous memory locations. Hence vectorization can be utilized   */
+  /*      effectively                                                           */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile size or input tile     */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - In this scenario, data elements for which data conversion from unsigned */
+  /*      16 bit to I32 bit need to done exist in non-contiguous memory         */
+  /*      location. In order to do vectorization across first dimension,        */
+  /*      output data pointers need to be updated based on output tile size     */
+  /*      and output tile pitch                                                 */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input data vectors */
+    xb_vecNx16U vecInData;
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx16U *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecN_2x32v *) (pOutput + (z * outTilePitch2));
+
+      valign vaInData = IVP_LANX16U_PP(pvecIn);
+      xb_vecN_2x32v vecOutL, vecOutH;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* load input data */
+        IVP_LANX16_IP(vecInData, vaInData, pvecIn);
+
+        xb_vecNx48 vecOutIntm1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData);
+
+        xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1));
+        IVP_MULUUAN_2X16X32_0(vecOutIntm2, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+        vecOutL = IVP_PACKVRNRN_2X64W(vecOutIntm2, shift);
+        vecOutL = IVP_MAXN_2X32(vecOutL, (xb_vecN_2x32v) minLim);
+
+        vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1));
+        IVP_MULUUAN_2X16X32_0(vecOutIntm2, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+        vecOutH = IVP_PACKVRNRN_2X64W(vecOutIntm2, shift);
+        vecOutH = IVP_MAXN_2X32(vecOutH, (xb_vecN_2x32v) minLim);
+        /* store output data */
+        IVP_SAN_2X32_IP(vecOutL, vaOut, pvecOut);
+        IVP_SAN_2X32_IP(vecOutH, vaOut, pvecOut);
+      }
+      int32_t varLen = (maxLoopCount - x);
+      /* load input data */
+      IVP_LANX16_IP(vecInData, vaInData, pvecIn);
+
+      xb_vecNx48 vecOutIntm1    = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData);
+      xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1));
+      IVP_MULUUAN_2X16X32_0(vecOutIntm2, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+      vecOutL = IVP_PACKVRNRN_2X64W(vecOutIntm2, shift);
+      vecOutL = IVP_MAXN_2X32(vecOutL, (xb_vecN_2x32v) minLim);
+
+
+      vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1));
+      IVP_MULUUAN_2X16X32_0(vecOutIntm2, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+      vecOutH = IVP_PACKVRNRN_2X64W(vecOutIntm2, shift);
+      vecOutH = IVP_MAXN_2X32(vecOutH, (xb_vecN_2x32v) minLim);
+
+      /* store output data */
+      IVP_SAVN_2X32_XP(vecOutL, vaOut, pvecOut, (varLen << 2));
+      IVP_SAVN_2X32_XP(vecOutH, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1));
+      IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch                                                   */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+          xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H;
+
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData3, vaInData, pvecIn);
+
+
+          xb_vecNx48 vecOutIntm1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecOutIntm2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecOutIntm3 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2);
+          xb_vecNx48 vecOutIntm4 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData3);
+
+          xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut0L   = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut0L   = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut0H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm2), IVP_CVT64SNX48LL(vecOutIntm2));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut1L   = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut1L   = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm2), IVP_CVT64SNX48HL(vecOutIntm2));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut1H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm3), IVP_CVT64SNX48LL(vecOutIntm3));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut2L   = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut2L   = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm3), IVP_CVT64SNX48HL(vecOutIntm3));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut2H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut2H = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm4), IVP_CVT64SNX48LL(vecOutIntm4));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut3L   = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut3L   = IVP_MAXN_2X32(vecOut3L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm4), IVP_CVT64SNX48HL(vecOutIntm4));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut3H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut3H = IVP_MAXN_2X32(vecOut3H, (xb_vecN_2x32v) minLim);
+
+          /* Store output data */
+          IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut2L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut2H, vaOut, pvecOut);
+          IVP_SAVN_2X32_XP(vecOut3L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut3H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1, vecInData2;
+          xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H;
+
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData2, vaInData, pvecIn);
+
+          xb_vecNx48 vecOutIntm1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecOutIntm2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecOutIntm3 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2);
+
+          xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut0L   = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut0L   = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut0H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm2), IVP_CVT64SNX48LL(vecOutIntm2));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut1L   = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut1L   = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm2), IVP_CVT64SNX48HL(vecOutIntm2));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut1H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm3), IVP_CVT64SNX48LL(vecOutIntm3));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut2L   = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut2L   = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm3), IVP_CVT64SNX48HL(vecOutIntm3));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut2H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut2H = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim);
+
+          /* Store output data */
+          IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAVN_2X32_XP(vecOut2L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut2H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1;
+          xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H;
+
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+
+
+          xb_vecNx48 vecOutIntm1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecOutIntm2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1);
+
+          xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut0L   = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut0L   = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut0H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm2), IVP_CVT64SNX48LL(vecOutIntm2));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut1L   = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut1L   = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm2), IVP_CVT64SNX48HL(vecOutIntm2));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut1H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim);
+
+          /* Store output data */
+          IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAVN_2X32_XP(vecOut1L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut1H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0;
+          xb_vecN_2x32v vecOut0L, vecOut0H;
+
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+
+          xb_vecNx48 vecOutIntm1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0);
+
+          xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut0L   = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut0L   = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1));
+          IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal);
+          vecOut0H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift);
+          vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+          /* Store output data */
+          IVP_SAVN_2X32_XP(vecOut0L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut0H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_U16S16 **************************/
+/* Description : P6 implementation for conversion  U16 to S16             */
+/*               out = IVP_PACKVRNX48(in * scale, shift), element-wise    */
+/* Inputs      : Input Tile (U16), scale, shift (shift < 32)              */
+/* Outputs     : XI Error Code                                            */
+/* InOuts      : Output Tile (S16, same size/data order as input)         */
+/* Assumptions : InData is unsigned 16bit                                 */
+/**************************************************************************/
+
+XAI_ERR_TYPE xaiDataConversion3D_U16S16(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const uint16_t scale,
+                                        const uint8_t shift)
+{
+  /* Error Checks: tile types, DRAM placement, equal sizes, shift range, data order */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U16(inTile);
+    XAI_CHECK_TILE3D_S16(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 32", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters (sizes are taken from the input tile; pitches are in elements) */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  uint16_t *pInput = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int16_t *pOutput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  valign vaOut     = IVP_ZALIGN();
+
+  /* vectorization width (elements per vector) and its multiples for the unrolled paths */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx16U * restrict pvecIn;
+  xb_vecNx16 * restrict pvecOut;
+
+  /********************************************************************************/
+  /* The overall design approach is split into 2 parts                            */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch   */
+  /*    is equal to output tile pitch                                             */
+  /*    - If above condition holds good, data elements for which data             */
+  /*      conversion from unsigned 16 bit to S16 needs to be done are present     */
+  /*      in contiguous memory locations. Hence vectorization can be utilized     */
+  /*      effectively                                                             */
+  /*                                                                              */
+  /* 2. When input tile pitch is not equal to input tile size or input tile       */
+  /*    pitch is not equal to output tile pitch                                   */
+  /*    - In this scenario, data elements for which data conversion from unsigned */
+  /*      16 bit to S16 needs to be done exist in non-contiguous memory           */
+  /*      locations. In order to do vectorization across first dimension,         */
+  /*      output data pointers need to be updated based on output tile size       */
+  /*      and output tile pitch                                                   */
+  /********************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input data vectors */
+    xb_vecNx16U vecInData;
+    /* Initialize max loop counter: one dim1 x dim2 plane per z iteration */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter: the whole tile is handled in a single pass */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx16U *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx16 *) (pOutput + (z * outTilePitch2));
+
+      valign vaInData = IVP_LANX16U_PP(pvecIn);
+      xb_vecNx16 vecOut;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* load input data */
+        IVP_LANX16U_IP(vecInData, vaInData, pvecIn);
+
+        /* apply scale and shift to input data.
+         * multiplying with scale results in 32 way 48-bit
+         * data to which shift is applied, so final result is
+         * 32 way 16 bit.
+         */
+        vecOut = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData), shift);
+
+        /* store output data */
+        IVP_SANX16_IP(vecOut, vaOut, pvecOut);
+      }
+      int32_t varLen = (maxLoopCount - x);
+      /* load input data for the last (possibly partial) vector */
+      IVP_LANX16U_IP(vecInData, vaInData, pvecIn);
+
+      /* apply scale and shift to input data.
+       * multiplying with scale results in 32 way 48-bit
+       * data to which shift is applied, so final result is
+       * 32 way 16 bit.
+       */
+      vecOut = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData), shift);
+
+      /* store the remaining varLen elements (varLen << 1 bytes) */
+      IVP_SAVNX16_XP(vecOut, vaOut, pvecOut, (varLen << 1));
+      IVP_SAPOSNX16_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch                                                   */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input (unsigned) and output (signed) data vectors */
+          xb_vecNx16U vecInData0, vecInData1, vecInData2, vecInData3;
+          xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3;
+
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData3, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift);
+          vecOut2 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2), shift);
+          vecOut3 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData3), shift);
+
+          /* Store output data: three full vectors, then varLen elements of the fourth */
+          IVP_SANX16_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX16_IP(vecOut1, vaOut, pvecOut);
+          IVP_SANX16_IP(vecOut2, vaOut, pvecOut);
+          IVP_SAVNX16_XP(vecOut3, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input (unsigned) and output (signed) data vectors */
+          xb_vecNx16U vecInData0, vecInData1, vecInData2;
+          xb_vecNx16 vecOut0, vecOut1, vecOut2;
+
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData2, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift);
+          vecOut2 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2), shift);
+
+          /* Store output data: two full vectors, then varLen elements of the third */
+          IVP_SANX16_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX16_IP(vecOut1, vaOut, pvecOut);
+          IVP_SAVNX16_XP(vecOut2, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input (unsigned) and output (signed) data vectors */
+          xb_vecNx16U vecInData0, vecInData1;
+          xb_vecNx16 vecOut0, vecOut1;
+
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+
+          /* load input data (unsigned load, matching the other paths) */
+          IVP_LANX16U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData1, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift);
+          vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift);
+
+          /* Store output data: one full vector, then varLen elements of the second */
+          IVP_SANX16_IP(vecOut0, vaOut, pvecOut);
+          IVP_SAVNX16_XP(vecOut1, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int16_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input (unsigned) and output (signed) data vectors */
+          xb_vecNx16U vecInData0;
+          xb_vecNx16 vecOut0;
+
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+
+          /* load input data (unsigned load, matching the other paths) */
+          IVP_LANX16U_IP(vecInData0, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift);
+
+          /* Store output data: only varLen elements of the single vector */
+          IVP_SAVNX16_XP(vecOut0, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_S16U16 **************************/
+/* Description : P6 implementation for conversion  S16 to U16             */
+/*               depending on Output Tile type                            */
+/* Inputs      : Input Tile, scale, shift                                 */
+/* Outputs     : XI Error Code                                            */
+/* InOuts      : Output Tile                                              */
+/* Assumptions : InData is signed 16bit                                   */
+/**************************************************************************/
+
+XAI_ERR_TYPE xaiDataConversion3D_S16U16(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const uint16_t scale,
+                                        const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S16(inTile);
+    XAI_CHECK_TILE3D_U16(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 32", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  int16_t *pInput   = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  uint16_t *pOutput = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  valign vaOut      = IVP_ZALIGN();
+
+  /* vectorization width */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx16 * restrict pvecIn;
+  xb_vecNx16U * restrict pvecOut;
+
+  /********************************************************************************/
+  /* The overall design approach is split into 2 parts                            */
+  /* 1. When input tile pitch is equal to input tile width and output tile pitch  */
+  /*    is also equal to the tile width                                           */
+  /*    - If above condition holds good, data elements for which data             */
+  /*      conversion from S16 to U16 needs to be done are present in              */
+  /*      contiguous memory locations. Hence vectorization can be utilized        */
+  /*      effectively                                                             */
+  /*                                                                              */
+  /* 2. When input tile pitch is not equal to input tile width or output tile     */
+  /*    pitch is not equal to the tile width                                      */
+  /*    - In this scenario, data elements for which data conversion from S16 to   */
+  /*      U16 needs to be done exist in non-contiguous memory                     */
+  /*      locations. In order to do vectorization across first dimension,         */
+  /*      output data pointers need to be updated based on output tile size       */
+  /*      and output tile pitch                                                   */
+  /********************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input data vectors */
+    xb_vecNx16 vecInData;
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx16 *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx16U *) (pOutput + (z * outTilePitch2));
+
+      valign vaInData = IVP_LANX16_PP(pvecIn); /* prime the input alignment register */
+      xb_vecNx16 vecOut;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) /* full vectors only; the last vector is handled after the loop */
+      {
+        /* load input data */
+        IVP_LANX16_IP(vecInData, vaInData, pvecIn);
+
+        /* apply scale and shift to input data.
+         * multiplying with scale results in 32 way 48-bit
+         * data to which shift is applied, so final result is
+         * 32 way 16 bit.
+         */
+        PACK_ROUND_U16(vecOut, vecInData, scale, shift);
+        /* store output data */
+        IVP_SANX16U_IP(vecOut, vaOut, pvecOut);
+      }
+      int32_t varLen = (maxLoopCount - x); /* elements remaining for the last (possibly partial) vector */
+      /* load input data */
+      IVP_LANX16_IP(vecInData, vaInData, pvecIn);
+
+      /* apply scale and shift to input data.
+       * multiplying with scale results in 32 way 48-bit
+       * data to which shift is applied, so final result is
+       * 32 way 16 bit.
+       */
+      PACK_ROUND_U16(vecOut, vecInData, scale, shift);
+      /* store output data */
+      IVP_SAVNX16U_XP(vecOut, vaOut, pvecOut, (varLen << 1)); /* variable-length store: varLen U16 elements, 2 bytes each */
+      IVP_SAPOSNX16U_FP(vaOut, pvecOut); /* flush what is still held in the output alignment register */
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to tile width or output tile      */
+    /* pitch is not equal to tile width                                                          */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        uint16_t *pOut = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X); /* elements left for the 4th (last) vector of the row */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+          xb_vecNx16U vecOut0, vecOut1, vecOut2, vecOut3;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData3, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+
+          PACK_ROUND_U16(vecOut0, vecInData0, scale, shift);
+          PACK_ROUND_U16(vecOut1, vecInData1, scale, shift);
+          PACK_ROUND_U16(vecOut2, vecInData2, scale, shift);
+          PACK_ROUND_U16(vecOut3, vecInData3, scale, shift);
+
+          /* Store output data */
+          IVP_SANX16U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX16U_IP(vecOut1, vaOut, pvecOut);
+          IVP_SANX16U_IP(vecOut2, vaOut, pvecOut);
+          IVP_SAVNX16U_XP(vecOut3, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16U_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        uint16_t *pOut = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X); /* elements left for the 3rd (last) vector of the row */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1, vecInData2;
+          xb_vecNx16U vecOut0, vecOut1, vecOut2;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData2, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          PACK_ROUND_U16(vecOut0, vecInData0, scale, shift);
+          PACK_ROUND_U16(vecOut1, vecInData1, scale, shift);
+          PACK_ROUND_U16(vecOut2, vecInData2, scale, shift);
+
+          /* Store output data */
+          IVP_SANX16U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX16U_IP(vecOut1, vaOut, pvecOut);
+          IVP_SAVNX16U_XP(vecOut2, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16U_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        uint16_t *pOut = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth); /* elements left for the 2nd (last) vector of the row */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1;
+          xb_vecNx16U vecOut0, vecOut1;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          PACK_ROUND_U16(vecOut0, vecInData0, scale, shift);
+          PACK_ROUND_U16(vecOut1, vecInData1, scale, shift);
+
+          /* Store output data */
+          IVP_SANX16U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SAVNX16U_XP(vecOut1, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16U_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        uint16_t *pOut = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x; /* elements left for the single remaining vector of the row */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0;
+          xb_vecNx16U vecOut0;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx16U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          PACK_ROUND_U16(vecOut0, vecInData0, scale, shift);
+          /* Store output data */
+          IVP_SAVNX16U_XP(vecOut0, vaOut, pvecOut, (varLen << 1));
+          IVP_SAPOSNX16U_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_S8I64 *****************************/
+/* Description : P6 implementation for conversion  S8 to I64              */
+/*               depending on Output Tile type                            */
+/* Inputs      : Input Tile, scale, shift                                 */
+/* Outputs     : XI Error Code                                            */
+/* InOuts      : Output Tile                                              */
+/* Assumptions : InData is signed 8bit                                    */
+/**************************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D_S8I64(const xai_pTile3D inTile,
+                                       xai_pTile3D outTile,
+                                       const uint16_t scale,
+                                       const uint8_t shift)
+{
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_TILE3D_I64(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  const int32_t dim1Size             = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size             = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size             = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1         = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2         = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1        = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2        = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  valign vaOut                       = IVP_ZALIGN();
+  int8_t *pInput                     = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int64_t *pOutput                   = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+  const int32_t minLim               = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64)) ? INT_MIN : 0;
+  // minLim is the lower clamp applied to every result: INT_MIN for an S64 output tile, 0 otherwise.
+  // S16 x U16 = S32, rounded and shifted back to S32, so the data stays within S32 range and INT_MIN is sufficient for S64.
+  int32_t x, y, z;
+  xb_vecNx8 *restrict pvecIn;
+  xb_vecN_2x64w *restrict pvecOut;
+
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile dim1 and output tile pitch */
+  /*    is also equal to the tile dim1                                          */
+  /*    - If above condition holds good, data elements for which data           */
+  /*      conversion from S8 to I64 needs to be done are present in contiguous  */
+  /*      memory locations. Hence vectorization can be utilized effectively     */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile dim1 or output tile    */
+  /*    pitch is not equal to the tile dim1                                     */
+  /*    - In this scenario, data elements for which data conversion from S8 to  */
+  /*      I64 needs to be done exist in non-contiguous memory locations.        */
+  /*      In order to do vectorization across first dimension, output data      */
+  /*      pointers need to be updated based on output tile size and output tile */
+  /*      pitch.                                                                */
+  /******************************************************************************/
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input and output vectors */
+    xb_vecNx16 vecInData;
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) /* planes are contiguous too: process the whole tile as one run */
+    {
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      xb_vecN_2x32v vecOutTempL, vecOutTempH;
+      xb_vecN_2x64w vecOutL, vecOutH;
+      pvecIn  = (xb_vecNx8 *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecN_2x64w *) (pOutput + (z * outTilePitch2));
+      valign vaInData = IVP_LANX8S_PP(pvecIn); /* prime the input alignment register */
+      int32_t varlen;
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) /* full vectors only; the last vector is handled after the loop */
+      {
+        IVP_LANX8S_IP(vecInData, vaInData, pvecIn); /* load S8 input into 16-bit lanes */
+        xb_vecNx48 vecIntRes      = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData); /* scale (unsigned) x data (signed) into the wide accumulator */
+        xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes)); /* presumably the lower half of the lanes, as 64-bit wide data - confirm against ISA */
+        vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift); /* apply shift and pack down to 32-bit lanes */
+        vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); /* clamp from below at minLim */
+        vecOutL     = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); /* multiply by 1 to get the value back into a 64-bit wide vector for the store */
+        xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes)); /* same steps for the other half of the lanes */
+        vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+        vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+        vecOutH     = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+        IVP_SAN_2X64W_IP(vecOutL, vaOut, pvecOut);
+        IVP_SAN_2X64W_IP(vecOutH, vaOut, pvecOut);
+      }
+
+      varlen = (maxLoopCount - x); /* elements remaining for the last (possibly partial) vector */
+      IVP_LANX8S_IP(vecInData, vaInData, pvecIn);
+      xb_vecNx48 vecIntRes      = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData);
+      xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes));
+      vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+      vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+      vecOutL     = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+      xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes));
+      vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+      vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+      vecOutH     = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+      IVP_SAVN_2X64W_XP(vecOutL, vaOut, pvecOut, (varlen << 3)); /* byte count: 8 bytes per 64-bit output element */
+      IVP_SAVN_2X64W_XP(vecOutH, vaOut, pvecOut, ((varlen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); /* remaining bytes after the first half (SIMD_WIDTH/2 elements x 8 bytes); may be <= 0, presumably treated as an empty store - confirm */
+      IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); /* flush what is still held in the output alignment register */
+    }
+  }
+  else
+  {
+    xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+    xb_vecN_2x64w vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H;
+    xb_vecN_2x32v vecOutTempL, vecOutTempH;
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) /* 4 input vectors per row per step; the 4th store is variable length */
+      {
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X); /* elements left for the 4th (last) vector of the row */
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData3, vaInData, pvecIn);
+          xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecIntRes2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2);
+          xb_vecNx48 vecIntRes3 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData3);
+          vecOut0L    = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H    = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut1L    = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1H    = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+          vecOut2L    = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2));
+          vecOut2H    = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2));
+          vecOut3L    = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes3), IVP_CVT64SNX48LL(vecIntRes3));
+          vecOut3H    = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes3), IVP_CVT64SNX48HL(vecIntRes3));
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); /* per half-vector: shift+pack to 32 bit, clamp at minLim, widen back to 64 bit */
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          vecOut0L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          vecOut0H    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          vecOut1L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          vecOut1H    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          vecOut2L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          vecOut2H    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut3L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          vecOut3L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut3H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          vecOut3H    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut2L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut2H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut3L, vaOut, pvecOut, (varLen << 3)); /* byte count: 8 bytes per 64-bit output element */
+          IVP_SAVN_2X64W_XP(vecOut3H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); /* bytes left after the first half-vector */
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X)) /* remainder needs 3 input vectors; the 3rd store is variable length */
+      {
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData2, vaInData, pvecIn);
+          xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecIntRes2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2);
+          vecOut0L    = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H    = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut1L    = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1H    = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+          vecOut2L    = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2));
+          vecOut2H    = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2));
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          vecOut0L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          vecOut0H    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          vecOut1L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          vecOut1H    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          vecOut2L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          vecOut2H    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut2L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut2H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth)) /* remainder needs 2 input vectors; the 2nd store is variable length */
+      {
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1);
+          vecOut0L    = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H    = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut1L    = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1H    = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          vecOut0L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          vecOut0H    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          vecOut1L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          vecOut1H    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut1L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut1H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size) /* remainder fits in a single input vector */
+      {
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0);
+          vecOut0L    = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H    = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          vecOut0L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          vecOut0H    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+          IVP_SAVN_2X64W_XP(vecOut0L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut0H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_U8I64 *****************************/
+/* Description : P6 implementation for conversion of U8 to I64:           */
+/*               out = (in * scale), rounded and shifted by 'shift'       */
+/* Inputs      : Input Tile, scale, shift                                 */
+/* Outputs     : XAI Error Code                                           */
+/* InOuts      : Output Tile                                              */
+/* Assumptions : InData is unsigned 8bit                                  */
+/**************************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D_U8I64(const xai_pTile3D inTile,   /* U8 input tile */
+                                       xai_pTile3D outTile,        /* I64 output tile, same size as inTile */
+                                       const uint16_t scale,       /* multiplier applied to every element */
+                                       const uint8_t shift)        /* shift used when packing; must be < 24 */
+{
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U8(inTile);
+    XAI_CHECK_TILE3D_I64(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile); /* extents are read from inTile; outTile is checked to have the same size */
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile); /* pitches are used as element strides: pitch1 per dim2 step, pitch2 per dim3 step */
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  valign vaOut                = IVP_ZALIGN(); /* alignment state shared by all IVP_SA* output stores below */
+
+  /* Get Data Pointers */
+  uint8_t *pInput  = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int64_t *pOutput = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* Elements consumed per input vector load, and its multiples used by the unrolled loops */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx8U *restrict pvecIn;
+  xb_vecN_2x64w *restrict pvecOut;
+
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When both the input and the output dim1 pitch are equal to the tile's   */
+  /*    dim1 size                                                               */
+  /*    - In this case the elements to be converted from U8 to I64 are          */
+  /*      contiguous in memory across rows, so vectorization can be utilized    */
+  /*      effectively over the whole run of elements.                           */
+  /*                                                                            */
+  /* 2. When the input or the output dim1 pitch is not equal to the tile's      */
+  /*    dim1 size                                                               */
+  /*    - In this scenario the elements to be converted from U8 to I64 exist    */
+  /*      in non-contiguous memory locations.                                   */
+  /*      In order to vectorize across the first dimension, the input and       */
+  /*      output data pointers are recomputed for every row from the tile       */
+  /*      pitches.                                                              */
+  /******************************************************************************/
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input vector */
+    xb_vecNx16U vecInData;
+
+    /* Initialize max loop counter: one run of dim1Size * dim2Size elements per dim3 plane */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter: the whole tile is handled as a single run */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      xb_vecN_2x32v vecOutTempL, vecOutTempH;
+      xb_vecN_2x64w vecOutL, vecOutH;
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx8U *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecN_2x64w *) (pOutput + (z * outTilePitch2));
+      valign vaInData = IVP_LANX8U_PP(pvecIn);
+      int32_t varlen;
+      /* Full vectors only; the last (possibly partial) vector is handled after the loop */
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* Load input data */
+        IVP_LANX8U_IP(vecInData, vaInData, pvecIn);
+        /* Multiply by scale, then handle the two halves (L and H) of the product vector separately */
+        xb_vecNx48 vecIntRes      = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData);
+        xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes));
+        vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+        //sign extending to 64bit (multiplication by the constant 1 into the wide 64bit vector)
+        vecOutL = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+        xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes));
+        vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+        //sign extending to 64bit
+        vecOutH = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+        IVP_SAN_2X64W_IP(vecOutL, vaOut, pvecOut);
+        IVP_SAN_2X64W_IP(vecOutH, vaOut, pvecOut);
+      }
+      varlen = (maxLoopCount - x); /* elements left for the last vector: at most vectorizationWidth */
+      IVP_LANX8U_IP(vecInData, vaInData, pvecIn);
+
+      xb_vecNx48 vecIntRes      = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData);
+      xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes));
+      vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+      //sign extending to 64bit
+      vecOutL = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+      xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes));
+      vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+      vecOutH     = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+      /* store output data; lengths are in bytes (8 per I64 element) and the second store gets    */
+      /* what is left after the first vector - NOTE(review): presumably a length <= 0 stores nothing */
+      IVP_SAVN_2X64W_XP(vecOutL, vaOut, pvecOut, (varlen << 3));
+      IVP_SAVN_2X64W_XP(vecOutH, vaOut, pvecOut, ((varlen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+      IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); /* presumably flushes the bytes still pending in vaOut - confirm */
+    }
+  }
+  else
+  {
+    /* else block is executed if the input tile dim1 pitch or the output tile dim1 pitch */
+    /* is not equal to the tile dim1 size                                                */
+    xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; /* NOTE(review): signed type here, the contiguous path uses xb_vecNx16U - confirm intended */
+    xb_vecN_2x64w vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H;
+    xb_vecN_2x32v vecOutTempL, vecOutTempH;
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension; the 4th vector of a chunk is stored with a variable length */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X); /* elements of this row left for the 4th vector */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData3, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecIntRes2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2);
+          xb_vecNx48 vecIntRes3 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData3);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+          vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2));
+          vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2));
+          vecOut3L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes3), IVP_CVT64SNX48LL(vecIntRes3));
+          vecOut3H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes3), IVP_CVT64SNX48HL(vecIntRes3));
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          //sign extending to 64bit
+          vecOut0L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          //sign extending to 64bit
+          vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift);
+          //sign extending to 64bit
+          vecOut1L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift);
+          //sign extending to 64bit
+          vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift);
+          //sign extending to 64bit
+          vecOut2L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift);
+          //sign extending to 64bit
+          vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut3L, shift);
+          //sign extending to 64bit
+          vecOut3L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut3H, shift);
+          //sign extending to 64bit
+          vecOut3H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+
+          /* Store output data: three full vector pairs, then the 4th limited to varLen elements (8 bytes each) */
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut2L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut2H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut3L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut3H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X)) /* more than 2 vectors of elements left in the row: use 3 vectors */
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X); /* elements of this row left for the 3rd vector */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData2, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecIntRes2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+          vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2));
+          vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2));
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          //sign extending to 64bit
+          vecOut0L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          //sign extending to 64bit
+          vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift);
+          //sign extending to 64bit
+          vecOut1L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift);
+          //sign extending to 64bit
+          vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift);
+          //sign extending to 64bit
+          vecOut2L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift);
+          //sign extending to 64bit
+          vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+
+          /* Store output data: two full vector pairs, then the 3rd limited to varLen elements */
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut2L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut2H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth)) /* more than 1 vector of elements left in the row: use 2 vectors */
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth); /* elements of this row left for the 2nd vector */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          //sign extending to 64bit
+          vecOut0L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          //sign extending to 64bit
+          vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift);
+          //sign extending to 64bit
+          vecOut1L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift);
+          //sign extending to 64bit
+          vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+
+          /* Store output data: one full vector pair, then the 2nd limited to varLen elements */
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut1L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut1H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size) /* at most one vector of elements left in the row */
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;; /* elements of this row left for the single vector */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          //sign extending to 64bit
+          vecOut0L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          //sign extending to 64bit
+          vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          /* Store output data: a single vector pair limited to varLen elements */
+          IVP_SAVN_2X64W_XP(vecOut0L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut0H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_S16I64 *****************************/
+/* Description : P6 implementation for conversion of S16 to I64;          */
+/*               results are clamped at 0 unless the output tile is S64   */
+/* Inputs      : Input Tile, scale, shift                                 */
+/* Outputs     : XAI Error Code                                           */
+/* InOuts      : Output Tile                                              */
+/* Assumptions : InData is signed 16bit                                   */
+/**************************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D_S16I64(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const uint16_t scale,
+                                        const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S16(inTile);
+    XAI_CHECK_TILE3D_I64(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  valign vaOut = IVP_ZALIGN();
+
+  /* Get Data Pointers */
+  int16_t *pInput  = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int64_t *pOutput = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* vectorization width */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+  const int32_t minLim               = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64)) ? INT_MIN : 0;
+  //S16 x U16 = S32 , rounded and shifted back to S32.
+  //even though the output type is S64. Data is within S32 range so INT_MIN is sufficient.
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx16 *restrict pvecIn;
+  xb_vecN_2x64w *restrict pvecOut;
+
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile dim1 and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If above condition holds good, data elements for which data           */
+  /*      conversion from S16 bit to I64 bit need to done present in contiguous  */
+  /*      memory location. Hence vectorization can be utilized effectively      */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile size or input tile     */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - In this scenario, data elements for which data conversion from S16 bit */
+  /*      I64 bit need to done exist in non-contiguous memory location.         */
+  /*      In order to do vectorization across first dimension, output data      */
+  /*      pointers need to be updated based on output tile size and output tile */
+  /*      pitch.                                                                */
+  /******************************************************************************/
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input and output vectors */
+    xb_vecNx16 vecInData;
+
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      xb_vecN_2x32v vecOutTempL, vecOutTempH;
+      xb_vecN_2x64w vecOutL, vecOutH;
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx16 *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecN_2x64w *) (pOutput + (z * outTilePitch2));
+      valign vaInData = IVP_LANX16_PP(pvecIn);
+      int32_t varlen;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* Load input data */
+        IVP_LANX16_IP(vecInData, vaInData, pvecIn);
+
+        xb_vecNx48 vecIntRes      = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData);
+        xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes));
+        vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+        vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+
+        //sign extending to 64bit
+        vecOutL = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+        xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes));
+        vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+        vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+        //sign extending to 64bit
+        vecOutH = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+        IVP_SAN_2X64W_IP(vecOutL, vaOut, pvecOut);
+        IVP_SAN_2X64W_IP(vecOutH, vaOut, pvecOut);
+      }
+      varlen = (maxLoopCount - x);
+      IVP_LANX16_IP(vecInData, vaInData, pvecIn);
+
+      xb_vecNx48 vecIntRes      = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData);
+      xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes));
+      vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+      vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+      //sign extending to 64bit
+      vecOutL = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+      xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes));
+      vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+      vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+      vecOutH     = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+      /* store output data */
+      IVP_SAVN_2X64W_XP(vecOutL, vaOut, pvecOut, (varlen << 3));
+      IVP_SAVN_2X64W_XP(vecOutH, vaOut, pvecOut, ((varlen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+      IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch                                                   */
+    xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+    xb_vecN_2x64w vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H;
+    xb_vecN_2x32v vecOutTempL, vecOutTempH;
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData3, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecIntRes2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2);
+          xb_vecNx48 vecIntRes3 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData3);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+          vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2));
+          vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2));
+          vecOut3L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes3), IVP_CVT64SNX48LL(vecIntRes3));
+          vecOut3H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes3), IVP_CVT64SNX48HL(vecIntRes3));
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut1L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut2L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut3L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut3L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut3H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut3H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+
+          /* Store output data */
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut2L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut2H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut3L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut3H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData2, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecIntRes2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+          vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2));
+          vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2));
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut1L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut2L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+
+          /* Store output data */
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut2L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut2H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut1L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+
+          /* Store output data */
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut1L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut1H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0L    = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          /* Store output data */
+          IVP_SAVN_2X64W_XP(vecOut0L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut0H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_U16I64 *****************************/
+/* Description : P6 implementation for conversion of U16 data to I64 data */
+/*               Each element is scaled, rounded and right shifted        */
+/* Inputs      : Input Tile, scale, shift (shift must be less than 24)    */
+/* Outputs     : XAI Error Code                                           */
+/* InOuts      : Output Tile                                              */
+/* Assumptions : InData is unsigned 16bit                                 */
+/**************************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D_U16I64(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const uint16_t scale,
+                                        const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U16(inTile);
+    XAI_CHECK_TILE3D_I64(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  valign vaOut = IVP_ZALIGN();
+
+  /* Get Data Pointers */
+  uint16_t *pInput = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int64_t *pOutput = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* vectorization width and rounding term */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+  const uint32_t rndVal              = (shift > 0) ? (1u << (shift - 1)) : 0u; /* shift == 0: no rounding term, avoids shifting by -1 */
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx16U *restrict pvecIn;
+  xb_vecN_2x64w *restrict pvecOut;
+
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When both the input tile pitch and the output tile pitch are equal to   */
+  /*    the tile dim1 size                                                      */
+  /*    - If the above condition holds, the data elements to be converted from  */
+  /*      U16 to I64 are present in contiguous memory locations. Hence          */
+  /*      vectorization can be utilized effectively                             */
+  /*                                                                            */
+  /* 2. When the input tile pitch or the output tile pitch is not equal to the  */
+  /*    tile dim1 size                                                          */
+  /*    - In this scenario, the data elements to be converted from U16 to I64   */
+  /*      exist in non-contiguous memory locations.                             */
+  /*      In order to do vectorization across the first dimension, input and    */
+  /*      output data pointers need to be updated for every row based on the    */
+  /*      tile pitches.                                                         */
+  /******************************************************************************/
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input vector */
+    xb_vecNx16U vecInData;
+
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Update loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter: the whole tile is processed as one run */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      xb_vecN_2x32Uv vecOutTempL, vecOutTempH;
+      xb_vecN_2x64w vecOutL, vecOutH;
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx16U *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecN_2x64w *) (pOutput + (z * outTilePitch2));
+      valign vaInData = IVP_LANX16U_PP(pvecIn);
+      int32_t varlen;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* Load input data */
+        IVP_LANX16U_IP(vecInData, vaInData, pvecIn);
+
+        xb_vecNx48 vecIntRes      = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData);
+        xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes));
+        IVP_MULUUAN_2X16X32_0(vecOutIntm1, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+        vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOutIntm1, shift));
+        //widening to 64bit (unsigned multiply by 1)
+        vecOutL = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+        xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes));
+        IVP_MULUUAN_2X16X32_0(vecOutIntm2, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+        vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOutIntm2, shift));
+        //widening to 64bit (unsigned multiply by 1)
+        vecOutH = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+        IVP_SAN_2X64W_IP(vecOutL, vaOut, pvecOut);
+        IVP_SAN_2X64W_IP(vecOutH, vaOut, pvecOut);
+      }
+      varlen = (maxLoopCount - x); /* elements left for the final, possibly partial, vector */
+      IVP_LANX16U_IP(vecInData, vaInData, pvecIn); /* unsigned load, same as the main loop; NOTE(review): full-vector load, only varlen lanes are stored */
+
+      xb_vecNx48 vecIntRes      = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData);
+      xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes));
+      IVP_MULUUAN_2X16X32_0(vecOutIntm1, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+      vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOutIntm1, shift));
+      //widening to 64bit (unsigned multiply by 1)
+      vecOutL = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+      xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes));
+      IVP_MULUUAN_2X16X32_0(vecOutIntm2, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+      vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOutIntm2, shift));
+      //widening to 64bit (unsigned multiply by 1)
+      vecOutH = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+      /* store output data: variable-length stores, byte counts derived from varlen (8 bytes per element) */
+      IVP_SAVN_2X64W_XP(vecOutL, vaOut, pvecOut, (varlen << 3));
+      IVP_SAVN_2X64W_XP(vecOutH, vaOut, pvecOut, ((varlen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+      IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if the input tile pitch or the output tile pitch is not */
+    /* equal to the tile width (dim1 size)                                            */
+    xb_vecNx16U vecInData0, vecInData1, vecInData2, vecInData3;
+    xb_vecN_2x64w vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H;
+    xb_vecN_2x32Uv vecOutTempL, vecOutTempH;
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData3, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecIntRes2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2);
+          xb_vecNx48 vecIntRes3 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData3);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+          vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2));
+          vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2));
+          vecOut3L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes3), IVP_CVT64SNX48LL(vecIntRes3));
+          vecOut3H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes3), IVP_CVT64SNX48HL(vecIntRes3));
+
+          IVP_MULUUAN_2X16X32_0(vecOut0L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0L, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut0L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          IVP_MULUUAN_2X16X32_0(vecOut0H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0H, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut0H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          IVP_MULUUAN_2X16X32_0(vecOut1L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut1L, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut1L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          IVP_MULUUAN_2X16X32_0(vecOut1H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut1H, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut1H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          IVP_MULUUAN_2X16X32_0(vecOut2L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut2L, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut2L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          IVP_MULUUAN_2X16X32_0(vecOut2H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut2H, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut2H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          IVP_MULUUAN_2X16X32_0(vecOut3L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut3L, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut3L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          IVP_MULUUAN_2X16X32_0(vecOut3H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut3H, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut3H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+
+          /* Store output data: the 4th vector uses variable-length stores sized from varLen */
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut2L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut2H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut3L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut3H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Remainder spans 3 vectors: initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData2, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1);
+          xb_vecNx48 vecIntRes2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+          vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2));
+          vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2));
+
+          IVP_MULUUAN_2X16X32_0(vecOut0L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0L, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut0L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          IVP_MULUUAN_2X16X32_0(vecOut0H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0H, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut0H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          IVP_MULUUAN_2X16X32_0(vecOut1L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut1L, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut1L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          IVP_MULUUAN_2X16X32_0(vecOut1H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut1H, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut1H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          IVP_MULUUAN_2X16X32_0(vecOut2L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut2L, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut2L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          IVP_MULUUAN_2X16X32_0(vecOut2H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut2H, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut2H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          /* Store output data */
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut2L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut2H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Remainder spans 2 vectors: initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData1, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0);
+          xb_vecNx48 vecIntRes1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+          vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1));
+          vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1));
+
+
+          IVP_MULUUAN_2X16X32_0(vecOut0L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0L, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut0L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          IVP_MULUUAN_2X16X32_0(vecOut0H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0H, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut0H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          IVP_MULUUAN_2X16X32_0(vecOut1L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut1L, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut1L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          IVP_MULUUAN_2X16X32_0(vecOut1H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut1H, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut1H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+
+          /* Store output data */
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut1L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut1H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Remainder fits in 1 vector: initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16U_IP(vecInData0, vaInData, pvecIn);
+
+          xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0));
+
+          IVP_MULUUAN_2X16X32_0(vecOut0L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0L, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut0L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+          IVP_MULUUAN_2X16X32_0(vecOut0H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding
+          vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0H, shift));
+          //widening to 64bit (unsigned multiply by 1)
+          vecOut0H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          /* Store output data */
+          IVP_SAVN_2X64W_XP(vecOut0L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut0H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D ****************************************/
+/* Description : Front-end dispatcher for the DataConversion3D kernels.           */
+/*               Selects the variant matching the input and output tile           */
+/*               data types and forwards all arguments to it                      */
+/* Inputs      : Input Tile, scale, shift                                        */
+/* Outputs     : XI Error Code (NULLARG, NO_VARIANT or the variant's status)      */
+/* InOuts      : Output Tile                                                     */
+/*********************************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D(const xai_pTile3D inTile,
+                                 xai_pTile3D outTile,
+                                 const uint16_t scale,
+                                 const uint8_t shift)
+{
+  if (!(inTile && outTile)) /* both tiles are mandatory */
+  {
+    return XAI_ERR_NULLARG;
+  }
+  if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U16)) /* unsigned 16-bit input */
+  {
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16))
+    {
+      return xaiDataConversion3D_U16S16(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32))
+    {
+      return xaiDataConversion3D_U16I32(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64))
+    {
+      return xaiDataConversion3D_U16I64(inTile, outTile, scale, shift);
+    }
+    /* Every output type not matched above is forwarded to the U16 -> I8   */
+    /* variant, so this branch never reaches the final XAI_ERR_NO_VARIANT  */
+    /* return.                                                             */
+    return xaiDataConversion3D_U16I8(inTile, outTile, scale, shift);
+  }
+  if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) /* signed 16-bit input */
+  {
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16))
+    {
+      return xaiDataConversion3D_S16(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))
+    {
+      return xaiDataConversion3D_S16U16(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32))
+    {
+      return xaiDataConversion3D_S16I32(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64))
+    {
+      return xaiDataConversion3D_S16I64(inTile, outTile, scale, shift);
+    }
+    /* Every output type not matched above is forwarded to the S16 -> I8   */
+    /* variant, so this branch never reaches the final XAI_ERR_NO_VARIANT  */
+    /* return.                                                             */
+    return xaiDataConversion3D_S16I8(inTile, outTile, scale, shift);
+  }
+  if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) /* signed 8-bit input */
+  {
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8))
+    {
+      return xaiDataConversion3D_S8U8(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16))
+    {
+      return xaiDataConversion3D_S8S16(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32))
+    {
+      return xaiDataConversion3D_S8I32(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64))
+    {
+      return xaiDataConversion3D_S8I64(inTile, outTile, scale, shift);
+    }
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) /* unsigned 8-bit input */
+  {
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8))
+    {
+      return xaiDataConversion3D_U8S8(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16))
+    {
+      return xaiDataConversion3D_U8S16(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))
+    {
+      return xaiDataConversion3D_U8U16(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32))
+    {
+      return xaiDataConversion3D_U8I32(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64))
+    {
+      return xaiDataConversion3D_U8I64(inTile, outTile, scale, shift);
+    }
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S32)) /* signed 32-bit input */
+  {
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8))
+    {
+      return xaiDataConversion3D_S32S8(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8))
+    {
+      return xaiDataConversion3D_S32U8(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16))
+    {
+      return xaiDataConversion3D_S32S16(inTile, outTile, scale, shift);
+    }
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))
+    {
+      return xaiDataConversion3D_S32U16(inTile, outTile, scale, shift);
+    }
+  }
+  return XAI_ERR_NO_VARIANT; /* no variant exists for this input/output type pair */
+}
+
+/********************* xaiDataConversion3D_AsymQ_U8S8 ********************/
+/* Description : P6 implementation for conversion from U8_SYM to S8_ASYM */
+/* Inputs      : Input Tile, zeroOut [-128,127], scale, shift (< 24)     */
+/* Outputs     : XI Error Code                                           */
+/* InOuts      : Output Tile (S8, same size as Input Tile)               */
+/* Assumptions : InData is unsigned 8bit, OutData is signed 8bit         */
+/*************************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D_AsymQ_U8S8(const xai_pTile3D inTile,
+                                            xai_pTile3D outTile,
+                                            const int16_t zeroOut,
+                                            const uint16_t scale,
+                                            const uint8_t shift)
+{
+  /* Error Checks: tile types, DRAM placement, equal sizes, shift/zeroOut ranges, matching data order */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U8(inTile);
+    XAI_CHECK_TILE3D_S8(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR((zeroOut >= -128) && (zeroOut < 128), XAI_ERR_NORM, \
+                    "\nzeroOut = %hi, value must be greater than or equal to -128 and less than 128", zeroOut);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+  /* Get Tile Parameters (sizes come from the input tile; pitches are used below as element offsets) */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  valign vaOut = IVP_ZALIGN(); /* alignment state shared by every aligning store below */
+
+  /* Get Data Pointers */
+  uint8_t *pInput = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* vectorization width and the multiples used by the unrolled row paths */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  /* loop variables: x along dim1, y along dim2, z along dim3 */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx8U * restrict pvecIn;
+  xb_vecNx8 * restrict pvecOut;
+
+  /* input and output data vectors (used by the non-contiguous path) */
+  xb_vecNx16U vecInData0, vecInData1, vecInData2, vecInData3;
+  xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3;
+  /* Accumulator seed: zeroOut pre-shifted left by 'shift' so the later pack by 'shift' adds zeroOut back */
+  int32_t zeroOutParam    = zeroOut; /* NOTE(review): zeroOut may be negative; '<<' on a negative value is undefined in ISO C - confirm toolchain behavior */
+  xb_vecNx48 zeroOutShift = IVP_CVT48SNX32((xb_vecN_2x32v) (zeroOutParam << shift), (xb_vecN_2x32v) (zeroOutParam << shift));
+  xb_vecNx16U vecScale    = (xb_vecNx16U) (scale);
+
+  /********************************************************************************/
+  /* The overall design approach is split into 2 parts                            */
+  /* 1. When the input tile pitch and the output tile pitch are both equal to     */
+  /*    the tile width (dim1Size)                                                 */
+  /*    - If the above condition holds, the elements to be converted from U8 to   */
+  /*      S8 lie in contiguous memory within each row, so whole rows (or planes)  */
+  /*      can be processed as one linear run of vectors                           */
+  /*                                                                              */
+  /* 2. When the input tile pitch or the output tile pitch differs from the       */
+  /*    tile width                                                                */
+  /*    - In this scenario, the elements to be converted from U8 to S8 lie in     */
+  /*      non-contiguous memory. Vectorization is done across the first           */
+  /*      dimension, and the input/output pointers are recomputed for every row   */
+  /*      from the tile pitches                                                   */
+  /*                                                                              */
+  /********************************************************************************/
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+    /* Input and output vectors */
+    xb_vecNx16U vecInData;
+    xb_vecNx16 vecOut;
+
+    /* Default: one linear run of dim1Size * dim2Size elements per dim3 plane */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Update loop counts when the planes are also back-to-back in both tiles */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Whole tile becomes a single linear run */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx8U *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx8 *) (pOutput + (z * outTilePitch2));
+      valign vaInData = IVP_LANX8U_PP(pvecIn);
+      int32_t varlen; /* number of valid elements in the tail vector */
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* Load input data */
+        IVP_LANX8U_IP(vecInData, vaInData, pvecIn);
+
+        /* acc = (zeroOut << shift) + vecInData * scale in each 48-bit lane;
+         * IVP_PACKVRNX48 then applies 'shift' and packs each lane to 16 bits,
+         * and the store below writes one 8-bit element per lane.
+         * NOTE(review): no explicit clamp to [SCHAR_MIN, SCHAR_MAX] is visible
+         * here - confirm the pack/store pair limits the range as intended. */
+        xb_vecNx48 acc = zeroOutShift;
+        IVP_MULUUANX16(acc, vecInData, vecScale);
+        vecOut = IVP_PACKVRNX48(acc, shift);
+
+        /* store output data */
+        IVP_SANX8S_IP(vecOut, vaOut, pvecOut);
+      }
+      varlen = (maxLoopCount - x); /* remaining elements: at most vectorizationWidth */
+      IVP_LANX8U_IP(vecInData, vaInData, pvecIn); /* tail: full-vector load, partial store below */
+
+      /* Same arithmetic as in the loop above, applied to the tail vector:
+       * acc = (zeroOut << shift) + vecInData * scale in each 48-bit lane,
+       * then IVP_PACKVRNX48 applies 'shift' and packs to 16-bit lanes;
+       * the variable-length store below writes one 8-bit element per lane.
+       */
+      xb_vecNx48 acc = zeroOutShift;
+      IVP_MULUUANX16(acc, vecInData, vecScale);
+      vecOut = IVP_PACKVRNX48(acc, shift);
+
+      /* store only varlen elements and finish the aligning-store sequence */
+      IVP_SAVNX8S_XP(vecOut, vaOut, pvecOut, varlen);
+      IVP_SAPOSNX8S_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if the input tile pitch or the output tile pitch is not equal to  */
+    /* the tile width (dim1Size); rows are then processed one at a time                         */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension: taken while more than 3 vectors remain in the row */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X); /* elements left for the 4th vector's variable-length store */
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData3, vaInData, pvecIn);
+
+          /* Same per-element arithmetic as the contiguous path, on 4 vectors:
+           * acc = (zeroOut << shift) + input * scale in each 48-bit lane,
+           * then IVP_PACKVRNX48 applies 'shift' and packs to 16-bit lanes;
+           * the stores below write one 8-bit element per lane.
+           */
+          xb_vecNx48 acc0, acc1, acc2, acc3;
+          acc0 = zeroOutShift;
+          acc1 = zeroOutShift;
+          acc2 = zeroOutShift;
+          acc3 = zeroOutShift;
+
+          IVP_MULUUANX16(acc0, vecInData0, vecScale);
+          vecOut0 = IVP_PACKVRNX48(acc0, shift);
+
+          IVP_MULUUANX16(acc1, vecInData1, vecScale);
+          vecOut1 = IVP_PACKVRNX48(acc1, shift);
+
+          IVP_MULUUANX16(acc2, vecInData2, vecScale);
+          vecOut2 = IVP_PACKVRNX48(acc2, shift);
+
+          IVP_MULUUANX16(acc3, vecInData3, vecScale);
+          vecOut3 = IVP_PACKVRNX48(acc3, shift);
+
+          /* Store output data: 3 full vectors, then a variable-length 4th */
+          IVP_SANX8S_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX8S_IP(vecOut1, vaOut, pvecOut);
+          IVP_SANX8S_IP(vecOut2, vaOut, pvecOut);
+          IVP_SAVNX8S_XP(vecOut3, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8S_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X)) /* remainder spans 3 vectors */
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData2, vaInData, pvecIn);
+
+          /* Same per-element arithmetic as the contiguous path, on 3 vectors:
+           * acc = (zeroOut << shift) + input * scale in each 48-bit lane,
+           * then IVP_PACKVRNX48 applies 'shift' and packs to 16-bit lanes;
+           * the stores below write one 8-bit element per lane.
+           */
+          xb_vecNx48 acc0, acc1, acc2;
+          acc0 = zeroOutShift;
+          acc1 = zeroOutShift;
+          acc2 = zeroOutShift;
+
+          IVP_MULUUANX16(acc0, vecInData0, vecScale);
+          vecOut0 = IVP_PACKVRNX48(acc0, shift);
+
+          IVP_MULUUANX16(acc1, vecInData1, vecScale);
+          vecOut1 = IVP_PACKVRNX48(acc1, shift);
+
+          IVP_MULUUANX16(acc2, vecInData2, vecScale);
+          vecOut2 = IVP_PACKVRNX48(acc2, shift);
+
+          /* Store output data: 2 full vectors, then a variable-length 3rd */
+          IVP_SANX8S_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX8S_IP(vecOut1, vaOut, pvecOut);
+          IVP_SAVNX8S_XP(vecOut2, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8S_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth)) /* remainder spans 2 vectors */
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8U_IP(vecInData1, vaInData, pvecIn);
+
+          /* Same per-element arithmetic as the contiguous path, on 2 vectors:
+           * acc = (zeroOut << shift) + input * scale in each 48-bit lane,
+           * then IVP_PACKVRNX48 applies 'shift' and packs to 16-bit lanes;
+           * the stores below write one 8-bit element per lane.
+           */
+
+          xb_vecNx48 acc0, acc1;
+          acc0 = zeroOutShift;
+          acc1 = zeroOutShift;
+
+          IVP_MULUUANX16(acc0, vecInData0, vecScale);
+          vecOut0 = IVP_PACKVRNX48(acc0, shift);
+
+          IVP_MULUUANX16(acc1, vecInData1, vecScale);
+          vecOut1 = IVP_PACKVRNX48(acc1, shift);
+
+          /* Store output data: 1 full vector, then a variable-length 2nd */
+          IVP_SANX8S_IP(vecOut0, vaOut, pvecOut);
+          IVP_SAVNX8S_XP(vecOut1, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8S_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size) /* remainder fits in 1 vector */
+      {
+        /* Initialize input and output data pointer */
+        uint8_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8U_IP(vecInData0, vaInData, pvecIn);
+
+          /* Same per-element arithmetic as the contiguous path, on 1 vector:
+           * acc = (zeroOut << shift) + input * scale in each 48-bit lane,
+           * then IVP_PACKVRNX48 applies 'shift' and packs to 16-bit lanes;
+           * the store below writes one 8-bit element per lane.
+           */
+          xb_vecNx48 acc0;
+          acc0 = zeroOutShift;
+
+          IVP_MULUUANX16(acc0, vecInData0, vecScale);
+          vecOut0 = IVP_PACKVRNX48(acc0, shift);
+
+          /* Store output data: a single variable-length vector */
+          IVP_SAVNX8S_XP(vecOut0, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8S_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_AsymQ_S16S8 ********************/
+/* Description : P6 implementation for conversion from S16_SYM to S8_ASYM */
+/*               depending on Output Tile type                            */
+/* Inputs      : Input Tile, zeroOut, scale, shift                        */
+/* Outputs     : XI Error Code                                            */
+/* InOuts      : Output Tile                                              */
+/* Assumptions : InData is signed 16bit                                   */
+/**************************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S16S8(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const int16_t zeroOut,
+                                             const uint16_t scale,
+                                             const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S16(inTile);
+    XAI_CHECK_TILE3D_S8(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 32", shift);
+    XAI_CHECK_ERROR((zeroOut >= -128) && (zeroOut < 128), XAI_ERR_NORM, \
+                    "\nzeroOut = %hi, value must be greater than or equal to -128 and less than 128", zeroOut);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  const int16_t minLim = SCHAR_MIN;
+  const int16_t maxLim = SCHAR_MAX;
+
+  /* Get Data Pointers */
+  int16_t *pInput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  valign vaOut    = IVP_ZALIGN();
+
+  /* vectorization width */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx16 * restrict pvecIn;
+  xb_vecNx8U * restrict pvecOut;
+
+  int64_t zerOutShifted   = (int64_t) zeroOut << shift;
+  xb_vecN_2x32v hvecZeroL = (xb_vecN_2x32v) ((int32_t) (zerOutShifted & 0xFFFFFFFF));
+  xb_vecN_2x32v hvecZeroH = (xb_vecN_2x32v) ((int32_t) ((zerOutShifted >> 32) & 0xFFFFFFFF));
+  xb_vec2Nx8 dvecZeroSh   = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32 \
+                                                   (IVP_SELN_2X32I(hvecZeroH, hvecZeroL, IVP_SELI_32B_INTERLEAVE_1_LO)));
+  xb_vecNx48 zeroOutShift = IVP_CVT48UN_2X64L(dvecZeroSh, dvecZeroSh);
+  IVP_CVT48UN_2X64H(zeroOutShift, dvecZeroSh, dvecZeroSh);
+
+  xb_vecNx16U vecScale = (xb_vecNx16U) (scale);
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If the above condition holds, the data elements to be converted from  */
+  /*      signed 16 bit to 8 bit are present in contiguous memory locations.    */
+  /*      Hence vectorization can be utilized                                   */
+  /*      effectively                                                           */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile size or input tile     */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - In this scenario, the data elements to be converted from signed       */
+  /*      16 bit to 8 bit exist in non-contiguous memory locations.             */
+  /*      In order to do vectorization across the first dimension, the          */
+  /*      output data pointers need to be updated based on output tile size     */
+  /*      and output tile pitch                                                 */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input data vectors */
+    xb_vecNx16 vecInData;
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx16 *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx8U *) (pOutput + (z * outTilePitch2));
+
+      valign vaInData = IVP_LANX16_PP(pvecIn);
+      xb_vecNx16 vecOut;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* load input data */
+        IVP_LANX16_IP(vecInData, vaInData, pvecIn);
+
+        /* apply scale and shift to input data.
+         * multiplying with scale results in 32 way 48-bit
+         * data to which shift is applied, so final result is
+         * 32 way 16 bit.
+         */
+        xb_vecNx48 acc = zeroOutShift;
+        IVP_MULUSANX16(acc, vecScale, vecInData);
+        vecOut = IVP_PACKVRNX48(acc, shift);
+        vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+        /* store output data */
+        IVP_SANX8U_IP(vecOut, vaOut, pvecOut);
+      }
+      /* load input data */
+      IVP_LANX16_IP(vecInData, vaInData, pvecIn);
+
+      /* apply scale and shift to input data.
+       * multiplying with scale results in 32 way 48-bit
+       * data to which shift is applied, so final result is
+       * 32 way 16 bit.
+       */
+      xb_vecNx48 acc = zeroOutShift;
+      IVP_MULUSANX16(acc, vecScale, vecInData);
+      vecOut = IVP_PACKVRNX48(acc, shift);
+      vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+      /* store output data */
+      IVP_SAVNX8U_XP(vecOut, vaOut, pvecOut, (maxLoopCount - x));
+      IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch                                                   */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+          xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData3, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          xb_vecNx48 acc0, acc1, acc2, acc3;
+          acc0 = zeroOutShift;
+          acc1 = zeroOutShift;
+          acc2 = zeroOutShift;
+          acc3 = zeroOutShift;
+
+          IVP_MULUSANX16(acc0, vecScale, vecInData0);
+          vecOut0 = IVP_PACKVRNX48(acc0, shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          IVP_MULUSANX16(acc1, vecScale, vecInData1);
+          vecOut1 = IVP_PACKVRNX48(acc1, shift);
+          vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          IVP_MULUSANX16(acc2, vecScale, vecInData2);
+          vecOut2 = IVP_PACKVRNX48(acc2, shift);
+          vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          IVP_MULUSANX16(acc3, vecScale, vecInData3);
+          vecOut3 = IVP_PACKVRNX48(acc3, shift);
+          vecOut3 = IVP_MAXNX16(IVP_MINNX16(vecOut3, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data */
+          IVP_SANX8U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX8U_IP(vecOut1, vaOut, pvecOut);
+          IVP_SANX8U_IP(vecOut2, vaOut, pvecOut);
+          IVP_SAVNX8U_XP(vecOut3, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1, vecInData2;
+          xb_vecNx16 vecOut0, vecOut1, vecOut2;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData2, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          xb_vecNx48 acc0, acc1, acc2;
+          acc0 = zeroOutShift;
+          acc1 = zeroOutShift;
+          acc2 = zeroOutShift;
+
+          IVP_MULUSANX16(acc0, vecScale, vecInData0);
+          vecOut0 = IVP_PACKVRNX48(acc0, shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          IVP_MULUSANX16(acc1, vecScale, vecInData1);
+          vecOut1 = IVP_PACKVRNX48(acc1, shift);
+          vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          IVP_MULUSANX16(acc2, vecScale, vecInData2);
+          vecOut2 = IVP_PACKVRNX48(acc2, shift);
+          vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data */
+          IVP_SANX8U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX8U_IP(vecOut1, vaOut, pvecOut);
+          IVP_SAVNX8U_XP(vecOut2, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0, vecInData1;
+          xb_vecNx16 vecOut0, vecOut1;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16_IP(vecInData1, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          xb_vecNx48 acc0, acc1;
+          acc0 = zeroOutShift;
+          acc1 = zeroOutShift;
+
+          IVP_MULUSANX16(acc0, vecScale, vecInData0);
+          vecOut0 = IVP_PACKVRNX48(acc0, shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          IVP_MULUSANX16(acc1, vecScale, vecInData1);
+          vecOut1 = IVP_PACKVRNX48(acc1, shift);
+          vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data */
+          IVP_SANX8U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SAVNX8U_XP(vecOut1, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Initialize input and output data pointer */
+        int16_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16 vecInData0;
+          xb_vecNx16 vecOut0;
+
+          pvecIn  = (xb_vecNx16 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16_IP(vecInData0, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          xb_vecNx48 acc0 = zeroOutShift;
+
+          IVP_MULUSANX16(acc0, vecScale, vecInData0);
+          vecOut0 = IVP_PACKVRNX48(acc0, shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data */
+          IVP_SAVNX8U_XP(vecOut0, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************** xaiDataConversion3D_AsymQ_U16S8 *******************/
+/* Description : P6 implementation for conversion from U16_SYM to S8_ASYM */
+/*               depending on Output Tile type                            */
+/* Inputs      : Input Tile, zeroOut, scale, shift                        */
+/* Outputs     : XI Error Code                                            */
+/* InOuts      : Output Tile                                              */
+/* Assumptions : InData is unsigned 16bit                                 */
+/**************************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D_AsymQ_U16S8(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const int16_t zeroOut,
+                                             const uint16_t scale,
+                                             const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U16(inTile);
+    XAI_CHECK_TILE3D_S8(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 32", shift);
+    XAI_CHECK_ERROR((zeroOut >= -128) && (zeroOut < 128), XAI_ERR_NORM, \
+                    "\nzeroOut = %hi, value must be greater than or equal to -128 and less than 128", zeroOut);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  const int16_t minLim = SCHAR_MIN;
+  const int16_t maxLim = SCHAR_MAX;
+
+  /* Get Data Pointers */
+  uint16_t *pInput = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutput  = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  valign vaOut     = IVP_ZALIGN();
+
+  /* vectorization width */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx16U * restrict pvecIn;
+  xb_vecNx8U * restrict pvecOut;
+  /* Accumulator seed: (zeroOut << shift) as 64 bit, split into 32-bit halves, interleaved and converted to the 48-bit wide vector */
+  int64_t zerOutShifted   = (int64_t) zeroOut << shift;
+  xb_vecN_2x32v hvecZeroL = (xb_vecN_2x32v) ((int32_t) (zerOutShifted & 0xFFFFFFFF));
+  xb_vecN_2x32v hvecZeroH = (xb_vecN_2x32v) ((int32_t) ((zerOutShifted >> 32) & 0xFFFFFFFF));
+  xb_vec2Nx8 dvecZeroSh   = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32 \
+                                                   (IVP_SELN_2X32I(hvecZeroH, hvecZeroL, IVP_SELI_32B_INTERLEAVE_1_LO)));
+  xb_vecNx48 zeroOutShift = IVP_CVT48UN_2X64L(dvecZeroSh, dvecZeroSh);
+  IVP_CVT48UN_2X64H(zeroOutShift, dvecZeroSh, dvecZeroSh);
+
+  xb_vecNx16U vecScale = (xb_vecNx16U) (scale);
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If above condition holds good, data elements for which data           */
+  /*      conversion from unsigned 16 bit to S8 needs to be done are present    */
+  /*      in contiguous memory locations. Hence vectorization can be utilized   */
+  /*      effectively                                                           */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile size or input tile     */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - In this scenario, data elements for which data conversion from        */
+  /*      unsigned 16 bit to S8 needs to be done exist in non-contiguous        */
+  /*      memory locations. In order to do vectorization across first           */
+  /*      dimension, input and output data pointers are updated per row based   */
+  /*      on the input and output tile pitches                                  */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input data vectors */
+    xb_vecNx16U vecInData;
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx16U *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx8U *) (pOutput + (z * outTilePitch2));
+
+      valign vaInData = IVP_LANX16U_PP(pvecIn);
+      xb_vecNx16 vecOut;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* load input data */
+        IVP_LANX16U_IP(vecInData, vaInData, pvecIn);
+
+        /* apply scale and shift to input data.
+         * scale * input is accumulated on top of the (zeroOut << shift)
+         * seed in 32 way 48-bit data to which shift is applied, so final
+         * result is 32 way 16 bit, clamped to [SCHAR_MIN, SCHAR_MAX].
+         */
+        xb_vecNx48 acc = zeroOutShift;
+        IVP_MULUUANX16(acc, vecScale, vecInData);
+        vecOut = IVP_PACKVRNX48(acc, shift);
+        vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+        /* store output data */
+        IVP_SANX8U_IP(vecOut, vaOut, pvecOut);
+      }
+      /* load input data */
+      IVP_LANX16U_IP(vecInData, vaInData, pvecIn);
+
+      /* apply scale and shift to input data.
+       * multiplying with scale results in 32 way 48-bit
+       * data to which shift is applied, so final result is
+       * 32 way 16 bit.
+       */
+      xb_vecNx48 acc = zeroOutShift;
+      IVP_MULUUANX16(acc, vecScale, vecInData);
+      vecOut = IVP_PACKVRNX48(acc, shift);
+      vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+      /* store output data */
+      IVP_SAVNX8U_XP(vecOut, vaOut, pvecOut, (maxLoopCount - x));
+      IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch                                                   */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16U vecInData0, vecInData1, vecInData2, vecInData3;
+          xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3;
+
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+          /* load input data */
+          IVP_LANX16U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData3, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          xb_vecNx48 acc0, acc1, acc2, acc3;
+          acc0 = zeroOutShift;
+          acc1 = zeroOutShift;
+          acc2 = zeroOutShift;
+          acc3 = zeroOutShift;
+
+          IVP_MULUUANX16(acc0, vecScale, vecInData0);
+          vecOut0 = IVP_PACKVRNX48(acc0, shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          IVP_MULUUANX16(acc1, vecScale, vecInData1);
+          vecOut1 = IVP_PACKVRNX48(acc1, shift);
+          vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          IVP_MULUUANX16(acc2, vecScale, vecInData2);
+          vecOut2 = IVP_PACKVRNX48(acc2, shift);
+          vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          IVP_MULUUANX16(acc3, vecScale, vecInData3);
+          vecOut3 = IVP_PACKVRNX48(acc3, shift);
+          vecOut3 = IVP_MAXNX16(IVP_MINNX16(vecOut3, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data */
+          IVP_SANX8U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX8U_IP(vecOut1, vaOut, pvecOut);
+          IVP_SANX8U_IP(vecOut2, vaOut, pvecOut);
+          IVP_SAVNX8U_XP(vecOut3, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16U vecInData0, vecInData1, vecInData2;
+          xb_vecNx16 vecOut0, vecOut1, vecOut2;
+
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData2, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          xb_vecNx48 acc0, acc1, acc2;
+          acc0 = zeroOutShift;
+          acc1 = zeroOutShift;
+          acc2 = zeroOutShift;
+
+          IVP_MULUUANX16(acc0, vecScale, vecInData0);
+          vecOut0 = IVP_PACKVRNX48(acc0, shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          IVP_MULUUANX16(acc1, vecScale, vecInData1);
+          vecOut1 = IVP_PACKVRNX48(acc1, shift);
+          vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          IVP_MULUUANX16(acc2, vecScale, vecInData2);
+          vecOut2 = IVP_PACKVRNX48(acc2, shift);
+          vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data */
+          IVP_SANX8U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SANX8U_IP(vecOut1, vaOut, pvecOut);
+          IVP_SAVNX8U_XP(vecOut2, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16U vecInData0, vecInData1;
+          xb_vecNx16 vecOut0, vecOut1;
+
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+
+          /* load input data (unsigned loads for both vectors, matching the U16 input type) */
+          IVP_LANX16U_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX16U_IP(vecInData1, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          xb_vecNx48 acc0, acc1;
+          acc0 = zeroOutShift;
+          acc1 = zeroOutShift;
+
+          IVP_MULUUANX16(acc0, vecScale, vecInData0);
+          vecOut0 = IVP_PACKVRNX48(acc0, shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          IVP_MULUUANX16(acc1, vecScale, vecInData1);
+          vecOut1 = IVP_PACKVRNX48(acc1, shift);
+          vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data */
+          IVP_SANX8U_IP(vecOut0, vaOut, pvecOut);
+          IVP_SAVNX8U_XP(vecOut1, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Initialize input and output data pointer */
+        uint16_t * pIn = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecNx16U vecInData0;
+          xb_vecNx16 vecOut0;
+
+          pvecIn  = (xb_vecNx16U *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX16U_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX16U_IP(vecInData0, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in 32 way 48-bit
+           * data to which shift is applied, so final result is
+           * 32 way 16 bit.
+           */
+          xb_vecNx48 acc0 = zeroOutShift;
+
+          IVP_MULUUANX16(acc0, vecScale, vecInData0);
+          vecOut0 = IVP_PACKVRNX48(acc0, shift);
+          vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+          /* Store output data */
+          IVP_SAVNX8U_XP(vecOut0, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8U_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/* Temporary wrapper, to be removed later: forwards every argument unchanged to */
+/* xaiDataConversion3D_AsymQ_U16S8() and returns the status code it produces.  */
+XAI_ERR_TYPE xaiDataConversion3D_U16AS8(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const int16_t zeroOut,
+                                        const uint16_t scale, const uint8_t shift)
+{
+  return xaiDataConversion3D_AsymQ_U16S8(inTile, outTile, zeroOut, scale, shift);
+}
+
+/********************* xaiDataConversion3D_AsymQ_S32S8 ********************/
+/* Description : P6 implementation for conversion from S32_SYM to S8_ASYM */
+/*               depending on Output Tile type                            */
+/* Inputs      : Input Tile, zeroOut, scale, shift                        */
+/* Outputs     : XI Error Code                                            */
+/* InOuts      : Output Tile                                              */
+/* Assumptions : InData is signed 32bit                                   */
+/**************************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S32S8(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const int16_t zeroOut,
+                                             const uint16_t scale,
+                                             const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S32(inTile);
+    XAI_CHECK_TILE3D_S8(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 32", shift);
+    XAI_CHECK_ERROR((zeroOut >= -128) && (zeroOut < 128), XAI_ERR_NORM, \
+                    "\nzeroOut = %hi, value must be greater than or equal to -128 and less than 128", zeroOut);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  const int16_t minLim = SCHAR_MIN;
+  const int16_t maxLim = SCHAR_MAX;
+
+  /* Get Data Pointers */
+  int32_t *pInput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  valign vaOut = IVP_ZALIGN();
+
+  /* vectorization width: number of 32-bit input elements in one N_2x32 vector */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH / 2;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecN_2x32v * restrict pvecIn;
+  xb_vecNx8 * restrict pvecOut;
+
+  xb_vecN_2x64w vec0scaledIn64B, vec1scaledIn64B;
+
+  /* SCALE*/
+  xb_vecNx16U vecScale = (xb_vecNx16U) (scale);
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile width and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If above condition holds good, data elements for which data           */
+  /*      conversion from signed 32 bit to S8 needs to be done are present      */
+  /*      in contiguous memory locations. Hence vectorization can be utilized   */
+  /*      effectively                                                           */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile size or input tile     */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - In this scenario, data elements for which data conversion from        */
+  /*      signed 32 bit to S8 needs to be done exist in non-contiguous          */
+  /*      memory locations. In order to do vectorization across first           */
+  /*      dimension, input and output data pointers are updated per row based   */
+  /*      on the input and output tile pitches                                  */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input data vectors */
+    xb_vecN_2x32v vecInData0, vecInData1;
+
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecN_2x32v *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecNx8 *) (pOutput + (z * outTilePitch2));
+
+      valign vaInData = IVP_LAN_2X32_PP(pvecIn);
+      xb_vecNx16 vecOut, vecOut0, vecOut1;
+      x = 0;
+      for (; x < maxLoopCount - vectorizationWidth2X; x += vectorizationWidth2X)
+      {
+        /* Load input data */
+        IVP_LAN_2X32_IP(vecInData0, vaInData, pvecIn);
+        IVP_LAN_2X32_IP(vecInData1, vaInData, pvecIn);
+
+        /* Initialize the 64-bit wide vector with (zeroOut << shift)*/
+        vec0scaledIn64B = IVP_MULSUN_2X16X32_0(zeroOut, (1 << shift));
+        vec1scaledIn64B = IVP_MULSUN_2X16X32_0(zeroOut, (1 << shift));
+
+        /* Multiply U16 scale with S32 input and ACCUMULATE in 64-bit wide vector */
+        IVP_MULUSAN_2X16X32_0(vec0scaledIn64B, vecScale, vecInData0);
+        IVP_MULUSAN_2X16X32_0(vec1scaledIn64B, vecScale, vecInData1);
+
+        /* Pack the 64-bit wide vector in 32 bit vector, by applying shift */
+        xb_vecN_2x32v vec0scaledIn32B = IVP_PACKVRN_2X64W(vec0scaledIn64B, shift);
+        xb_vecN_2x32v vec1scaledIn32B = IVP_PACKVRN_2X64W(vec1scaledIn64B, shift);
+
+        /* CLAMP the 32bit scaled-shift data to minLim and maxLim, & view it
+         * as 16-bit lanes; the result sits in the even lanes (0, 2, 4...).*/
+        vecOut0 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec0scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim));
+        vecOut1 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec1scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim));
+
+        /* Select the actual data present at even lanes, i.e. 0, 2, 4,...  */
+        vecOut = IVP_SELNX16I(vecOut1, vecOut0, IVP_SELI_EXTRACT_1_OF_2_OFF_0);
+
+        /* Store output data */
+        IVP_SANX8S_IP(vecOut, vaOut, pvecOut);
+      }
+      /* Load remaining input data: the first vector takes up to vectorizationWidth
+       * elements, so the second one is only given what is left beyond those. */
+      IVP_LAVN_2X32_XP(vecInData0, vaInData, pvecIn, (maxLoopCount - x) * 4);
+      IVP_LAVN_2X32_XP(vecInData1, vaInData, pvecIn, ((maxLoopCount - x) - vectorizationWidth) * 4);
+
+      /* Initialize the 64-bit wide vector with (zeroOut << shift)*/
+      vec0scaledIn64B = IVP_MULSUN_2X16X32_0(zeroOut, (1 << shift));
+      vec1scaledIn64B = IVP_MULSUN_2X16X32_0(zeroOut, (1 << shift));
+
+      /* Multiply U16 scale with S32 input and ACCUMULATE in 64-bit wide vector */
+      IVP_MULUSAN_2X16X32_0(vec0scaledIn64B, vecScale, vecInData0);
+      IVP_MULUSAN_2X16X32_0(vec1scaledIn64B, vecScale, vecInData1);
+
+      /* Pack the 64-bit wide vector in 32 bit vector, by applying shift */
+      xb_vecN_2x32v vec0scaledIn32B = IVP_PACKVRN_2X64W(vec0scaledIn64B, shift);
+      xb_vecN_2x32v vec1scaledIn32B = IVP_PACKVRN_2X64W(vec1scaledIn64B, shift);
+
+      /* CLAMP the 32bit scaled-shift data to minLim and maxLim, & view it
+       * as 16-bit lanes; the result sits in the even lanes (0, 2, 4...).*/
+      vecOut0 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec0scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim));
+      vecOut1 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec1scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim));
+
+      /* Select the actual data present at even lanes, i.e. 0, 2, 4,...  */
+      vecOut = IVP_SELNX16I(vecOut1, vecOut0, IVP_SELI_EXTRACT_1_OF_2_OFF_0);
+
+      /* Store output data */
+      IVP_SAVNX8S_XP(vecOut, vaOut, pvecOut, (maxLoopCount - x));
+      IVP_SAPOSNX8S_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch                                                   */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      for (; x < dim1Size; x += vectorizationWidth2X)
+      {
+        /* Initialize input and output data pointer */
+        int32_t * pIn  = &pInput[z * inTilePitch2 + x];
+        int8_t *pOut   = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          /* input and output data vectors */
+          xb_vecN_2x32v vecInData0, vecInData1;
+          xb_vecNx16 vecOut0, vecOut1, vecOut;
+
+          pvecIn  = (xb_vecN_2x32v *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1));
+          /* Load input data: the second vector is only given what remains of the
+           * row after the first vectorizationWidth elements. */
+          valign vaInData = IVP_LAN_2X32_PP(pvecIn);
+          IVP_LAVN_2X32_XP(vecInData0, vaInData, pvecIn, varLen * 4);
+          IVP_LAVN_2X32_XP(vecInData1, vaInData, pvecIn, (varLen - vectorizationWidth) * 4);
+
+          /* Initialize the 64-bit wide vector with (zeroOut << shift)*/
+          vec0scaledIn64B = IVP_MULSUN_2X16X32_0(zeroOut, (1 << shift));
+          vec1scaledIn64B = IVP_MULSUN_2X16X32_0(zeroOut, (1 << shift));
+
+          /* Multiply U16 scale with S32 input and ACCUMULATE in 64-bit wide vector */
+          IVP_MULUSAN_2X16X32_0(vec0scaledIn64B, vecScale, vecInData0);
+          IVP_MULUSAN_2X16X32_0(vec1scaledIn64B, vecScale, vecInData1);
+
+          /* Pack the 64-bit wide vector in 32 bit vector, by applying shift */
+          xb_vecN_2x32v vec0scaledIn32B = IVP_PACKVRN_2X64W(vec0scaledIn64B, shift);
+          xb_vecN_2x32v vec1scaledIn32B = IVP_PACKVRN_2X64W(vec1scaledIn64B, shift);
+
+          /* CLAMP the 32bit scaled-shift data to minLim and maxLim, & view it
+           * as 16-bit lanes; the result sits in the even lanes (0, 2, 4...).*/
+          vecOut0 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec0scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim));
+          vecOut1 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec1scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim));
+
+          /* Select the actual data present at even lanes, i.e. 0, 2, 4,...  */
+          vecOut = IVP_SELNX16I(vecOut1, vecOut0, IVP_SELI_EXTRACT_1_OF_2_OFF_0);
+
+          /* Store output data */
+          IVP_SAVNX8S_XP(vecOut, vaOut, pvecOut, varLen);
+          IVP_SAPOSNX8S_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_AsymQ_S8I32 ********************/
+/* Description : P6 implementation for conversion from S8_ASYM to I32_SYM */
+/* Inputs      : Input Tile, zeroIn, scale, shift                         */
+/* Outputs     : XI Error Code                                            */
+/* InOuts      : Output Tile                                              */
+/* Assumptions : InData is signed 8bit, shift < 24, -128 <= zeroIn < 128  */
+/**************************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8I32(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const int16_t zeroIn,
+                                             const uint16_t scale,
+                                             const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_TILE3D_I32(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR((zeroIn >= -128) && (zeroIn < 128), XAI_ERR_NORM, \
+                    "\nzeroIn = %hi, value must be greater than or equal to -128 and less than 128", zeroIn);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  valign vaOut = IVP_ZALIGN();
+
+  /* Get Data Pointers */
+  int8_t *pInput   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int32_t *pOutput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* vectorization width */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+  /* Lower clamp: INT_MIN when outTile is of type XAI_S32, otherwise 0 */
+  const int32_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) ? INT_MIN : 0;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx8 * restrict pvecIn;
+  xb_vecN_2x32v * restrict pvecOut;
+
+  /* input and output data vectors */
+  xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+  xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H;
+
+  xb_vecNx16U vecScale   = (xb_vecNx16U) (scale);
+  xb_vecNx16 vecZeroIn   = (xb_vecNx16) (-zeroIn);
+  xb_vecNx48 zeroInScale = IVP_MULUSNX16(vecScale, vecZeroIn);
+  /* NOTE(review): shift == 0 makes (shift - 1) negative in IVP_SLAN_2X32; confirm */
+  /******************************************************************************/
+  /* Per element: out = max(minLim,                                             */
+  /*              ((in - zeroIn) * scale + (1 << (shift - 1))) >> shift)        */
+  /* zeroInScale pre-seeds each accumulator with (-zeroIn * scale).             */
+  /*                                                                            */
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch and output tile pitch are both equal to dim1      */
+  /*    - Rows are contiguous in memory, so dim1 and dim2 (and dim3 too when    */
+  /*      the dim2 pitches equal dim1 * dim2) are merged into one linear run    */
+  /*      that is converted from S8 to 32 bit one vector at a time.             */
+  /*                                                                            */
+  /* 2. Otherwise (input or output dim1 pitch differs from dim1)                */
+  /*    - Rows are not contiguous. dim1 is processed in chunks of up to 4       */
+  /*      vectors and, for each chunk, every row of dim2 is converted with      */
+  /*      pointers recomputed from the input and output tile pitches.           */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input data vector */
+    xb_vecNx16 vecInData;
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx8 *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecN_2x32v *) (pOutput + (z * outTilePitch2));
+      valign vaInData = IVP_LANX8S_PP(pvecIn);
+      int32_t varlen;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* Load input data */
+        IVP_LANX8S_IP(vecInData, vaInData, pvecIn);
+
+        xb_vecNx48 acc = zeroInScale;
+        IVP_MULUSANX16(acc, vecScale, vecInData);
+        /* Split into 32-bit halves, add (1 << (shift - 1)), then shift right and clamp */
+        xb_vecN_2x32v vecIntResL = IVP_CVT32SNX48L(acc);
+        xb_vecN_2x32v vecIntResH = IVP_CVT32SNX48H(acc);
+        vecIntResL = IVP_ADDN_2X32(vecIntResL, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+        vecIntResH = IVP_ADDN_2X32(vecIntResH, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+        vecIntResL = IVP_SRAN_2X32(vecIntResL, (xb_vecN_2x32v) (shift));
+        vecIntResH = IVP_SRAN_2X32(vecIntResH, (xb_vecN_2x32v) (shift));
+        vecOut0L   = IVP_MAXN_2X32(vecIntResL, (xb_vecN_2x32v) minLim);
+        vecOut0H   = IVP_MAXN_2X32(vecIntResH, (xb_vecN_2x32v) minLim);
+
+
+        /* store output data */
+        IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+        IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+      }
+      varlen = (maxLoopCount - x); /* elements left for the tail vector */
+      IVP_LANX8S_IP(vecInData, vaInData, pvecIn);
+      /* NOTE(review): tail load above looks like a full-vector read; confirm over-read is safe */
+      xb_vecNx48 acc = zeroInScale;
+      IVP_MULUSANX16(acc, vecScale, vecInData);
+      xb_vecN_2x32v vecIntResL = IVP_CVT32SNX48L(acc);
+      xb_vecN_2x32v vecIntResH = IVP_CVT32SNX48H(acc);
+
+      vecIntResL = IVP_ADDN_2X32(vecIntResL, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+      vecIntResH = IVP_ADDN_2X32(vecIntResH, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+      vecIntResL = IVP_SRAN_2X32(vecIntResL, (xb_vecN_2x32v) (shift));
+      vecIntResH = IVP_SRAN_2X32(vecIntResH, (xb_vecN_2x32v) (shift));
+      vecOut0L   = IVP_MAXN_2X32(vecIntResL, (xb_vecN_2x32v) minLim);
+      vecOut0H   = IVP_MAXN_2X32(vecIntResH, (xb_vecN_2x32v) minLim);
+
+      /* store tail: counts are in bytes (4 per element); high half gets the excess over N/2 lanes */
+      IVP_SAVN_2X32_XP(vecOut0L, vaOut, pvecOut, (varlen << 2));
+      IVP_SAVN_2X32_XP(vecOut0H, vaOut, pvecOut, ((varlen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)));
+      IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if the input tile dim1 pitch or the output tile */
+    /* dim1 pitch is not equal to the tile dim1 size                          */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData3, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale accumulates into N way 48-bit
+           * data on top of the zero point term; each accumulator is
+           * then split into low/high halves of N/2 way 32 bit.
+           */
+          xb_vecNx48 acc0, acc1, acc2, acc3;
+          acc0 = zeroInScale;
+          acc1 = zeroInScale;
+          acc2 = zeroInScale;
+          acc3 = zeroInScale;
+
+          IVP_MULUSANX16(acc0, vecScale, vecInData0);
+          IVP_MULUSANX16(acc1, vecScale, vecInData1);
+          IVP_MULUSANX16(acc2, vecScale, vecInData2);
+          IVP_MULUSANX16(acc3, vecScale, vecInData3);
+
+          xb_vecN_2x32v vecIntRes0L = IVP_CVT32SNX48L(acc0);
+          xb_vecN_2x32v vecIntRes0H = IVP_CVT32SNX48H(acc0);
+          xb_vecN_2x32v vecIntRes1L = IVP_CVT32SNX48L(acc1);
+          xb_vecN_2x32v vecIntRes1H = IVP_CVT32SNX48H(acc1);
+          xb_vecN_2x32v vecIntRes2L = IVP_CVT32SNX48L(acc2);
+          xb_vecN_2x32v vecIntRes2H = IVP_CVT32SNX48H(acc2);
+          xb_vecN_2x32v vecIntRes3L = IVP_CVT32SNX48L(acc3);
+          xb_vecN_2x32v vecIntRes3H = IVP_CVT32SNX48H(acc3);
+
+          vecIntRes0L = IVP_ADDN_2X32(vecIntRes0L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecIntRes0H = IVP_ADDN_2X32(vecIntRes0H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecOut0L    = IVP_SRAN_2X32(vecIntRes0L, (xb_vecN_2x32v) (shift));
+          vecOut0H    = IVP_SRAN_2X32(vecIntRes0H, (xb_vecN_2x32v) (shift));
+          vecOut0L    = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          vecOut0H    = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+
+          vecIntRes1L = IVP_ADDN_2X32(vecIntRes1L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecIntRes1H = IVP_ADDN_2X32(vecIntRes1H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecOut1L    = IVP_SRAN_2X32(vecIntRes1L, (xb_vecN_2x32v) (shift));
+          vecOut1H    = IVP_SRAN_2X32(vecIntRes1H, (xb_vecN_2x32v) (shift));
+          vecOut1L    = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim);
+          vecOut1H    = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim);
+
+
+          vecIntRes2L = IVP_ADDN_2X32(vecIntRes2L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecIntRes2H = IVP_ADDN_2X32(vecIntRes2H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecOut2L    = IVP_SRAN_2X32(vecIntRes2L, (xb_vecN_2x32v) (shift));
+          vecOut2H    = IVP_SRAN_2X32(vecIntRes2H, (xb_vecN_2x32v) (shift));
+          vecOut2L    = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim);
+          vecOut2H    = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim);
+
+
+          vecIntRes3L = IVP_ADDN_2X32(vecIntRes3L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecIntRes3H = IVP_ADDN_2X32(vecIntRes3H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecOut3L    = IVP_SRAN_2X32(vecIntRes3L, (xb_vecN_2x32v) (shift));
+          vecOut3H    = IVP_SRAN_2X32(vecIntRes3H, (xb_vecN_2x32v) (shift));
+          vecOut3L    = IVP_MAXN_2X32(vecOut3L, (xb_vecN_2x32v) minLim);
+          vecOut3H    = IVP_MAXN_2X32(vecOut3H, (xb_vecN_2x32v) minLim);
+
+
+          /* Store output data */
+          IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut2L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut2H, vaOut, pvecOut);
+          /* varLen can exceed one vector while more chunks follow; store presumably clamps - confirm */
+          IVP_SAVN_2X32_XP(vecOut3L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut3H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X)) /* remainder needs 3 vectors */
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData2, vaInData, pvecIn);
+
+          xb_vecNx48 acc0, acc1, acc2;
+          acc0 = zeroInScale;
+          acc1 = zeroInScale;
+          acc2 = zeroInScale;
+
+          IVP_MULUSANX16(acc0, vecScale, vecInData0);
+          IVP_MULUSANX16(acc1, vecScale, vecInData1);
+          IVP_MULUSANX16(acc2, vecScale, vecInData2);
+
+          xb_vecN_2x32v vecIntRes0L = IVP_CVT32SNX48L(acc0);
+          xb_vecN_2x32v vecIntRes0H = IVP_CVT32SNX48H(acc0);
+          xb_vecN_2x32v vecIntRes1L = IVP_CVT32SNX48L(acc1);
+          xb_vecN_2x32v vecIntRes1H = IVP_CVT32SNX48H(acc1);
+          xb_vecN_2x32v vecIntRes2L = IVP_CVT32SNX48L(acc2);
+          xb_vecN_2x32v vecIntRes2H = IVP_CVT32SNX48H(acc2);
+
+
+          vecIntRes0L = IVP_ADDN_2X32(vecIntRes0L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecIntRes0H = IVP_ADDN_2X32(vecIntRes0H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecOut0L    = IVP_SRAN_2X32(vecIntRes0L, (xb_vecN_2x32v) (shift));
+          vecOut0H    = IVP_SRAN_2X32(vecIntRes0H, (xb_vecN_2x32v) (shift));
+          vecOut0L    = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          vecOut0H    = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+
+          vecIntRes1L = IVP_ADDN_2X32(vecIntRes1L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecIntRes1H = IVP_ADDN_2X32(vecIntRes1H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecOut1L    = IVP_SRAN_2X32(vecIntRes1L, (xb_vecN_2x32v) (shift));
+          vecOut1H    = IVP_SRAN_2X32(vecIntRes1H, (xb_vecN_2x32v) (shift));
+          vecOut1L    = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim);
+          vecOut1H    = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim);
+
+
+          vecIntRes2L = IVP_ADDN_2X32(vecIntRes2L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecIntRes2H = IVP_ADDN_2X32(vecIntRes2H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecOut2L    = IVP_SRAN_2X32(vecIntRes2L, (xb_vecN_2x32v) (shift));
+          vecOut2H    = IVP_SRAN_2X32(vecIntRes2H, (xb_vecN_2x32v) (shift));
+          vecOut2L    = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim);
+          vecOut2H    = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim);
+
+
+
+          /* Store output data */
+          IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAVN_2X32_XP(vecOut2L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut2H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth)) /* remainder needs 2 vectors */
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale accumulates into N way 48-bit
+           * data on top of the zero point term; each accumulator is
+           * then split into low/high halves of N/2 way 32 bit.
+           */
+          xb_vecNx48 acc0, acc1;
+          acc0 = zeroInScale;
+          acc1 = zeroInScale;
+
+          IVP_MULUSANX16(acc0, vecScale, vecInData0);
+          IVP_MULUSANX16(acc1, vecScale, vecInData1);
+
+          xb_vecN_2x32v vecIntRes0L = IVP_CVT32SNX48L(acc0);
+          xb_vecN_2x32v vecIntRes0H = IVP_CVT32SNX48H(acc0);
+          xb_vecN_2x32v vecIntRes1L = IVP_CVT32SNX48L(acc1);
+          xb_vecN_2x32v vecIntRes1H = IVP_CVT32SNX48H(acc1);
+
+
+
+          vecIntRes0L = IVP_ADDN_2X32(vecIntRes0L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecIntRes0H = IVP_ADDN_2X32(vecIntRes0H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecOut0L    = IVP_SRAN_2X32(vecIntRes0L, (xb_vecN_2x32v) (shift));
+          vecOut0H    = IVP_SRAN_2X32(vecIntRes0H, (xb_vecN_2x32v) (shift));
+          vecOut0L    = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          vecOut0H    = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+
+          vecIntRes1L = IVP_ADDN_2X32(vecIntRes1L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecIntRes1H = IVP_ADDN_2X32(vecIntRes1H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecOut1L    = IVP_SRAN_2X32(vecIntRes1L, (xb_vecN_2x32v) (shift));
+          vecOut1H    = IVP_SRAN_2X32(vecIntRes1H, (xb_vecN_2x32v) (shift));
+          vecOut1L    = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim);
+          vecOut1H    = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim);
+
+
+
+          /* Store output data */
+          IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAVN_2X32_XP(vecOut1L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut1H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size) /* remainder fits in 1 vector */
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int32_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1));
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale accumulates into N way 48-bit
+           * data on top of the zero point term; the accumulator is
+           * then split into low/high halves of N/2 way 32 bit.
+           */
+          xb_vecNx48 acc0 = zeroInScale;
+
+          IVP_MULUSANX16(acc0, vecScale, vecInData0);
+
+          xb_vecN_2x32v vecIntRes0L = IVP_CVT32SNX48L(acc0);
+          xb_vecN_2x32v vecIntRes0H = IVP_CVT32SNX48H(acc0);
+
+          vecIntRes0L = IVP_ADDN_2X32(vecIntRes0L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecIntRes0H = IVP_ADDN_2X32(vecIntRes0H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1)));
+          vecOut0L    = IVP_SRAN_2X32(vecIntRes0L, (xb_vecN_2x32v) (shift));
+          vecOut0H    = IVP_SRAN_2X32(vecIntRes0H, (xb_vecN_2x32v) (shift));
+          vecOut0L    = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim);
+          vecOut0H    = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim);
+
+
+          /* Store output data */
+          IVP_SAVN_2X32_XP(vecOut0L, vaOut, pvecOut, (varLen << 2));
+          IVP_SAVN_2X32_XP(vecOut0H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)));
+          IVP_SAPOSN_2X32_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_AsymQ_S8I64 ********************/
+/* Description : Q8 implementation for conversion from S8_ASYM to I64_SYM */
+/* Inputs      : Input Tile, zeroIn, scale, shift                         */
+/* Outputs     : XI Error Code                                            */
+/* InOuts      : Output Tile                                              */
+/* Assumptions : InData is signed 8bit                                    */
+/**************************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8I64(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const int16_t zeroIn,
+                                             const uint16_t scale,
+                                             const uint8_t shift)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_TILE3D_I64(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \
+                    "Shift = %hhu, value should be less than 24", shift);
+    XAI_CHECK_ERROR((zeroIn >= -128) && (zeroIn < 128), XAI_ERR_NORM, \
+                    "\nzeroIn = %hi, value must be greater than or equal to -128 and less than 128", zeroIn);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG,                  \
+                    "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inTilePitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inTilePitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  valign vaOut = IVP_ZALIGN();
+
+  /* Get Data Pointers */
+  int8_t *pInput   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int64_t *pOutput = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* vectorization width */
+  const int32_t vectorizationWidth   = XCHAL_IVPN_SIMD_WIDTH;
+  const int32_t vectorizationWidth2X = vectorizationWidth * 2;
+  const int32_t vectorizationWidth3X = vectorizationWidth * 3;
+  const int32_t vectorizationWidth4X = vectorizationWidth * 4;
+
+  const int32_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64)) ? INT_MIN : 0;
+  //S16 x U16 = S32 , rounded and shifted back to S32.
+  //even though the output type is S64. Data is within S32 range so INT_MIN is sufficient.
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  xb_vecNx8 * restrict pvecIn;
+  xb_vecN_2x64w * restrict pvecOut;
+
+  /* input and output data vectors */
+  xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3;
+  xb_vecN_2x64w vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H;
+
+  xb_vecNx16U vecScale   = (xb_vecNx16U) (scale);
+  xb_vecNx16 vecZeroIn   = (xb_vecNx16) (-zeroIn);
+  xb_vecNx48 zeroInScale = IVP_MULUSNX16(vecScale, vecZeroIn);
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When input tile pitch is equal to input tile dim1 and input tile pitch */
+  /*    is equal to output tile pitch                                           */
+  /*    - If above condition holds good, data elements for which data           */
+  /*      conversion from S8 bit to S64 bit need to done present in contiguous  */
+  /*      memory location. Hence vectorization can be utilized effectively      */
+  /*                                                                            */
+  /* 2. When input tile pitch is not equal to input tile size or input tile     */
+  /*    pitch is not equal to output tile pitch                                 */
+  /*    - In this scenario, data elements for which data conversion from S8 bit */
+  /*      S64 bit need to done exist in non-contiguous memory location.         */
+  /*      In order to do vectorization across first dimension, output data      */
+  /*      pointers need to be updated based on output tile size and output tile */
+  /*      pitch.                                                                */
+  /******************************************************************************/
+
+  if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* input and output vectors */
+    xb_vecNx16 vecInData;
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3Size;
+    int32_t maxLoopCount     = dim1Size * dim2Size;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount))
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+
+      /* Update max loop counter */
+      dim3MaxLoopCount = 1;
+      maxLoopCount    *= dim3Size;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      xb_vecN_2x32v vecOutTempL, vecOutTempH;
+      /* initialize input and output data pointer */
+      pvecIn  = (xb_vecNx8 *) (pInput + (z * inTilePitch2));
+      pvecOut = (xb_vecN_2x64w *) (pOutput + (z * outTilePitch2));
+      valign vaInData = IVP_LANX8S_PP(pvecIn);
+      int32_t varlen;
+
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* Load input data */
+        IVP_LANX8S_IP(vecInData, vaInData, pvecIn);
+
+        xb_vecNx48 acc = zeroInScale;
+        IVP_MULUSANX16(acc, vecScale, vecInData);
+
+        xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc), IVP_CVT64SNX48LL(acc));
+        vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+        vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+        //sign extending to 64bit
+        vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+        xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc), IVP_CVT64SNX48HL(acc));
+        vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+        vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+        //sign extending to 64bit
+        vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+        /* store output data */
+        IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+        IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+      }
+      varlen = (maxLoopCount - x);
+      IVP_LANX8S_IP(vecInData, vaInData, pvecIn);
+
+      xb_vecNx48 acc = zeroInScale;
+      IVP_MULUSANX16(acc, vecScale, vecInData);
+
+      xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc), IVP_CVT64SNX48LL(acc));
+      vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift);
+      vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+      //sign extending to 64bit
+      vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+      xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc), IVP_CVT64SNX48HL(acc));
+      vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift);
+      vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+      //sign extending to 64bit
+      vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+      /* store output data */
+      IVP_SAVN_2X64W_XP(vecOut0L, vaOut, pvecOut, (varlen << 3));
+      IVP_SAVN_2X64W_XP(vecOut0H, vaOut, pvecOut, ((varlen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+      IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+    }
+  }
+  else
+  {
+    /* else block is executed if input tile pitch is not equal to input tile width or input tile */
+    /* pitch is not equal to output tile pitch                                                   */
+
+    for (z = 0; z < dim3Size; z++)     /* along 3rd dimension */
+    {
+      xb_vecN_2x32v vecOutTempL, vecOutTempH;
+      x = 0;
+      /* Loop Unroll=4 along 1st dimension */
+      for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X)
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth3X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData2, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData3, vaInData, pvecIn);
+
+          /* apply scale and shift to input data.
+           * multiplying with scale results in N way 48-bit
+           * data, which is widened and packed with shift applied;
+           * the clamped result is then sign extended to 64 bit.
+           */
+          xb_vecNx48 acc0, acc1, acc2, acc3;
+          acc0 = zeroInScale;
+          acc1 = zeroInScale;
+          acc2 = zeroInScale;
+          acc3 = zeroInScale;
+
+          IVP_MULUSANX16(acc0, vecScale, vecInData0);
+          IVP_MULUSANX16(acc1, vecScale, vecInData1);
+          IVP_MULUSANX16(acc2, vecScale, vecInData2);
+          IVP_MULUSANX16(acc3, vecScale, vecInData3);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc0), IVP_CVT64SNX48LL(acc0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc0), IVP_CVT64SNX48HL(acc0));
+          vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc1), IVP_CVT64SNX48LL(acc1));
+          vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc1), IVP_CVT64SNX48HL(acc1));
+          vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc2), IVP_CVT64SNX48LL(acc2));
+          vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc2), IVP_CVT64SNX48HL(acc2));
+          vecOut3L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc3), IVP_CVT64SNX48LL(acc3));
+          vecOut3H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc3), IVP_CVT64SNX48HL(acc3));
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut2L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut3L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut3L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut3H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut3H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          /* Store output data */
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut2L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut2H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut3L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut3H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      if (x < (dim1Size - vectorizationWidth2X))
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth2X);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData2, vaInData, pvecIn);
+
+          xb_vecNx48 acc0, acc1, acc2;
+          acc0 = zeroInScale;
+          acc1 = zeroInScale;
+          acc2 = zeroInScale;
+
+          IVP_MULUSANX16(acc0, vecScale, vecInData0);
+          IVP_MULUSANX16(acc1, vecScale, vecInData1);
+          IVP_MULUSANX16(acc2, vecScale, vecInData2);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc0), IVP_CVT64SNX48LL(acc0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc0), IVP_CVT64SNX48HL(acc0));
+          vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc1), IVP_CVT64SNX48LL(acc1));
+          vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc1), IVP_CVT64SNX48HL(acc1));
+          vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc2), IVP_CVT64SNX48LL(acc2));
+          vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc2), IVP_CVT64SNX48HL(acc2));
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut2L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+
+          /* Store output data */
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut2L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut2H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < (dim1Size - vectorizationWidth))
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - (x + vectorizationWidth);
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+          IVP_LANX8S_IP(vecInData1, vaInData, pvecIn);
+
+          xb_vecNx48 acc0, acc1;
+          acc0 = zeroInScale;
+          acc1 = zeroInScale;
+
+          IVP_MULUSANX16(acc0, vecScale, vecInData0);
+          IVP_MULUSANX16(acc1, vecScale, vecInData1);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc0), IVP_CVT64SNX48LL(acc0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc0), IVP_CVT64SNX48HL(acc0));
+          vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc1), IVP_CVT64SNX48LL(acc1));
+          vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc1), IVP_CVT64SNX48HL(acc1));
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          /* Store output data */
+          IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut);
+          IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut);
+          IVP_SAVN_2X64W_XP(vecOut1L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut1H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+      else if (x < dim1Size)
+      {
+        /* Initialize input and output data pointer */
+        int8_t * pIn   = &pInput[z * inTilePitch2 + x];
+        int64_t *pOut  = &pOutput[z * outTilePitch2 + x];
+        int32_t varLen = dim1Size - x;
+
+        for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+        {
+          pvecIn  = (xb_vecNx8 *) (pIn + (y * inTilePitch1));
+          pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1));
+
+          valign vaInData = IVP_LANX8S_PP(pvecIn);
+          /* load input data */
+          IVP_LANX8S_IP(vecInData0, vaInData, pvecIn);
+
+          xb_vecNx48 acc0;
+          acc0 = zeroInScale;
+
+          IVP_MULUSANX16(acc0, vecScale, vecInData0);
+
+          vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc0), IVP_CVT64SNX48LL(acc0));
+          vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc0), IVP_CVT64SNX48HL(acc0));
+
+          vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift);
+          vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL);
+
+          vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift);
+          vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim);
+          //sign extending to 64bit
+          vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH);
+
+          /* Store output data */
+          IVP_SAVN_2X64W_XP(vecOut0L, vaOut, pvecOut, (varLen << 3));
+          IVP_SAVN_2X64W_XP(vecOut0H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2)));
+          IVP_SAPOSN_2X64W_FP(vaOut, pvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/********************* xaiDataConversion3D_AsymQ *********************************/
+/* Description : Dispatcher for the DataConversion3D_AsymQ optimized routines.   */
+/*               Selects a type-specific routine from the input and output tile  */
+/*               types (and, for some pairs, on whether zeroPoint is 0) and      */
+/*               forwards the arguments to it.                                   */
+/*                                                                               */
+/*               Supported type pairs (input -> output):                         */
+/*                 S8  -> S8, U8, S16, U16, S32/U32, S64/U64                     */
+/*                 U8  -> S8                                                     */
+/*                 S16 -> S8                                                     */
+/*                 U16 -> S8                                                     */
+/*                 S32 -> S8                                                     */
+/*                                                                               */
+/* Inputs      : Input Tile, zeroPoint, scale, shift                             */
+/* Outputs     : XI Error Code:                                                  */
+/*                 XAI_ERR_NULLARG    - inTile or outTile is NULL                */
+/*                 XAI_ERR_NO_VARIANT - no routine for this type pair            */
+/*                 otherwise the status returned by the selected routine         */
+/* InOuts      : Output Tile                                                     */
+/*********************************************************************************/
+XAI_ERR_TYPE xaiDataConversion3D_AsymQ(const xai_pTile3D inTile,
+                                       xai_pTile3D outTile,
+                                       const int16_t zeroPoint,
+                                       const uint16_t scale,
+                                       const uint8_t shift)
+{
+  /* A zero zeroPoint lets several type pairs use the plain (non-AsymQ) routine. */
+  const int32_t zeroPointIsZero = (zeroPoint == 0);
+
+  /* Both tile handles are mandatory. */
+  if (!(inTile && outTile))
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  /* ------------------------------ S8 input ------------------------------ */
+  if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+  {
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8))
+    {
+      /* S8_SYM/S8_ASYM -> S8_SYM/S8_ASYM; "zeroPoint" is the callee's "fixUp". */
+      return(xaiDataConversion3D_AsymQ_S8S8(inTile, outTile, zeroPoint, scale, shift));
+    }
+
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8))
+    {
+      /* S8_ASYM -> U8_SYM; "zeroPoint" is the callee's "fixUp". */
+      return(xaiDataConversion3D_AsymQ_S8U8(inTile, outTile, zeroPoint, scale, shift));
+    }
+
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16))
+    {
+      /* S8_ASYM -> S16_SYM ("zeroPoint" is the "fixUp"); plain S8 -> S16 when it is 0. */
+      return(zeroPointIsZero
+             ? xaiDataConversion3D_S8S16(inTile, outTile, scale, shift)
+             : xaiDataConversion3D_AsymQ_S8S16(inTile, outTile, zeroPoint, scale, shift));
+    }
+
+    if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))
+    {
+      /* S8_ASYM -> U16_SYM; "zeroPoint" is the callee's "fixUp". */
+      return(xaiDataConversion3D_AsymQ_S8U16(inTile, outTile, zeroPoint, scale, shift));
+    }
+
+    if ((XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32)))
+    {
+      /* S8_ASYM -> 32-bit integer; "zeroPoint" is the callee's "ZeroIn". */
+      return(xaiDataConversion3D_AsymQ_S8I32(inTile, outTile, zeroPoint, scale, shift));
+    }
+
+    if ((XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64)))
+    {
+      /* S8_ASYM -> 64-bit integer; "zeroPoint" is the callee's "ZeroIn". */
+      return(xaiDataConversion3D_AsymQ_S8I64(inTile, outTile, zeroPoint, scale, shift));
+    }
+
+    /* No S8-input routine exists for this output type. */
+    return(XAI_ERR_NO_VARIANT);
+  }
+
+  /* Every remaining input type can only be converted to an S8 output. */
+  if (!(XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)))
+  {
+    return(XAI_ERR_NO_VARIANT);
+  }
+
+  /* In all AsymQ cases below "zeroPoint" is the callee's "ZeroOut". */
+
+  /* ------------------------- U8_SYM -> S8_ASYM -------------------------- */
+  if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+  {
+    return(zeroPointIsZero
+           ? xaiDataConversion3D_U8S8(inTile, outTile, scale, shift)
+           : xaiDataConversion3D_AsymQ_U8S8(inTile, outTile, zeroPoint, scale, shift));
+  }
+
+  /* ------------------------- S16_SYM -> S8_ASYM ------------------------- */
+  if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+  {
+    return(zeroPointIsZero
+           ? xaiDataConversion3D_S16I8(inTile, outTile, scale, shift)
+           : xaiDataConversion3D_AsymQ_S16S8(inTile, outTile, zeroPoint, scale, shift));
+  }
+
+  /* ------------------------- U16_SYM -> S8_ASYM ------------------------- */
+  if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U16))
+  {
+    return(zeroPointIsZero
+           ? xaiDataConversion3D_U16I8(inTile, outTile, scale, shift)
+           : xaiDataConversion3D_AsymQ_U16S8(inTile, outTile, zeroPoint, scale, shift));
+  }
+
+  /* ------------------------- S32_SYM -> S8_ASYM ------------------------- */
+  if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S32))
+  {
+    return(zeroPointIsZero
+           ? xaiDataConversion3D_S32S8(inTile, outTile, scale, shift)
+           : xaiDataConversion3D_AsymQ_S32S8(inTile, outTile, zeroPoint, scale, shift));
+  }
+
+  /* Unsupported input type. */
+  return(XAI_ERR_NO_VARIANT);
+}
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.c
new file mode 100644
index 00000000000..d10991e879c
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#undef DILATED_VQ_CONV
+#include "cnn_dilated_conv_MOD.h"
+
+/******************************* end of MOD variants ***************************************/
+/*******************************************************************************************/
+#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.h
new file mode 100644
index 00000000000..0c5dd9a0e33
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.h
@@ -0,0 +1,16078 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+/******************************************************************************************
+* MOD WHD DWH variants
+******************************************************************************************/
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 1x1 MOD_WHD_DWH    */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate 1x1 MOD_WHD_DWH 3D     */
+/*               dilated convolution function and 1x1 MOD_WHD_DWH 3D VQ     */
+/*               dilated convolution function                               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 1x1xDxN                                     */
+/*               Input is in WHD and Output is in DWH format                */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "Stride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1), \
+                    XAI_ERR_BADARG, "\nDilation = %hhu\nDilation should be 1", XAI_CNN_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    /* Max value of Gather Offset is (min(numInCh-1,7)*inDataPitch2 + stride*min(3,outWidth-1)) */
+    if (numInCh > 1)
+    {
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) <                                                                       \
+                      ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(3, outW - 1)) / XT_MIN(numInCh - 1, 7)),            \
+                      XAI_ERR_BADARG, "\ndim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \
+                      XAI_TILE3D_GET_DIM2_PITCH(inTile),                                                                        \
+                      ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(3, outW - 1)) / XT_MIN(numInCh - 1, 7)));
+    }
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideU       = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitch of Coefficient Data (NDWH) in dim1 (W = 1 and H = 1) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecOut;
+
+#if XCHAL_HAVE_SUPERGATHER == 0
+  xb_vec2Nx8* pdvecCoeff1;
+  xb_vec2Nx8* pdvecCoeff2;
+  valign vIn;
+  xb_vec2Nx8* pdvecIn1;
+  xb_vec2Nx8* pdvecIn2;
+
+  /*updating sel1 corresponding to 8 outCh and,4 width from input, hence
+     for 8 input channel and 4 width elements from each load selection,
+     sel1=0,64,0+strideU,64+strideU,0+2*strideU,64+2*strideU,0+3*strideU,64+3*strideU,0+4*strideU,64+4*strideU,...
+     ...0+7*strideU,64+7*strideU*/
+  xb_vec2Nx8U sel  = IVP_SEQ2NX8();
+  xb_vecNx16U off  = IVP_MULNX16PACKL(IVP_ANDNX16(1, IVP_SEQNX16()), 64);
+  xb_vec2Nx8U off1 = IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(off), IVP_MOV2NX8_FROMNX16(off), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+  xb_vec2Nx8U sel1 = 0, sel2 = 0;
+  sel2 = IVP_SEL2NX8UI(IVP_MUL2NX8(IVP_SEQ2NX8U(), strideU), IVP_MUL2NX8(IVP_SEQ2NX8U(), strideU), IVP_SELI_8B_INTERLEAVE_1_LO);
+  sel2 = IVP_ADD2NX8U(sel2, off1);
+  IVP_SEL2NX8UT(sel1, 0, sel2, IVP_SEQ2NX8U(), IVP_LT2NX8(sel, 16));
+
+  xb_vec2Nx8 dvecIn  = 0, dvecIn1 = 0, dvecIn2 = 0, dvecIn3 = 0, dvecIn4 = 0;
+  xb_vec2Nx8 dvecIn5 = 0, dvecIn6 = 0, dvecIn7 = 0, dvecIn8 = 0;
+
+  /*implementation follows loading 8 input vectors corresponding to 8 inCh and ,first four elements
+     along width */
+
+  int32_t remainingInCh = numInCh - ((numInCh >> 3) << 3);
+
+  uint8_t remCh1   = 0, remCh2 = 0, remCh3 = 0, remCh4 = 0, remCh5 = 0, remCh6 = 0;
+  int32_t sumMask1 = 0, sumMask2 = 0;
+
+  if (remainingInCh != 0) /* if numInCh is not a multiple of 8*/
+  {
+    /* Generating Coefficient mask such that coefficient load happens only for valid channel number*/
+    /* Coefficient mask entries for channels greater than the remainingInCh are set to 0 */
+    /* Generating Coefficient mask such that coefficient load happens only for valid channel number*/
+    /* Coefficient mask entries for channels greater than the remainingInCh are set to 0 */
+    remCh1 = XT_SALT(1, remainingInCh);
+    remCh2 = XT_SALT(2, remainingInCh);
+    remCh3 = XT_SALT(3, remainingInCh);
+    remCh4 = XT_SALT(4, remainingInCh);
+    remCh5 = XT_SALT(5, remainingInCh);
+    remCh6 = XT_SALT(6, remainingInCh);
+
+    /*Generation of maskLut for handling cases when remainingInCh is not equal to 0   */
+    /*eg. if remainingInCh is equal to 2 then sumMask1 is 00FFFFFF and sumMask2 is 0  */
+    /*    if remainingInCh is equal to 3 then sumMask1 is FFFFFFFF and sumMask2 is 0  */
+    /*    if remainingInCh is equal to 4 then sumMask1 is FFFFFFFF and sumMask2 is FF */
+    const uint32_t maskLut[4] = { 0xff, 0xff00, 0xff0000, 0xff000000 };
+
+    sumMask1 = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2 + maskLut[3] * remCh3;
+    sumMask2 = maskLut[0] * remCh4 + maskLut[1] * remCh5 + maskLut[2] * remCh6;
+  }
+
+  /* Unrolling of 4 is done along output width and 8 along input channels */
+  /**          Loop Starts            **/
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Along output channels*/
+  {
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y++)   /* Along output height*/
+    {
+      for (x = 0; x < outW; x += 4)   /*Along output width*/
+      {
+        /* Input Data and Output Data Pointers */
+        int8_t* pSrc = pInData + y * inDataPitch1 * strideU + x * strideU;
+        int8_t* pOut = &pOutData[(y * outDataPitch2 + x * outDataPitch1) * bytesPerPixel];
+
+        /*  For corner case handling  */
+        int32_t remainingX = XT_MIN(4, outW - x);
+
+        /* Loading bias and initializing sum with bias*/
+        xb_vec2Nx24 dvecSum0, dvecSum1, dvecSum2, dvecSum3;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, dvecSum0, dvecSum1, dvecSum2, dvecSum3);
+
+        /* Coefficient Pointer */
+        pdvecCoeff1 = (xb_vec2Nx8 *) (&pCoeffData[outCh]);
+        pdvecCoeff2 = (xb_vec2Nx8 *) (&pCoeffData[outCh] + coeffPitch1);
+        pdvecIn1    = (xb_vec2Nx8 *) pSrc;
+        pdvecIn2    = (xb_vec2Nx8 *) (pSrc + inDataPitch2);
+
+        for (inCh = 0; inCh < (numInCh - 7); inCh += 8)
+        {
+          /*Loading input vector */
+          vIn = IVP_LA2NX8_PP(pdvecIn1);
+          IVP_LA2NX8_XP(dvecIn1, vIn, pdvecIn1, 2 * inDataPitch2);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn2);
+          IVP_LA2NX8_XP(dvecIn2, vIn, pdvecIn2, 2 * inDataPitch2);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn1);
+          IVP_LA2NX8_XP(dvecIn3, vIn, pdvecIn1, 2 * inDataPitch2);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn2);
+          IVP_LA2NX8_XP(dvecIn4, vIn, pdvecIn2, 2 * inDataPitch2);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn1);
+          IVP_LA2NX8_XP(dvecIn5, vIn, pdvecIn1, 2 * inDataPitch2);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn2);
+          IVP_LA2NX8_XP(dvecIn6, vIn, pdvecIn2, 2 * inDataPitch2);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn1);
+          IVP_LA2NX8_XP(dvecIn7, vIn, pdvecIn1, 2 * inDataPitch2);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn2);
+          IVP_LA2NX8_XP(dvecIn8, vIn, pdvecIn2, 2 * inDataPitch2);
+
+          /*dvecIn,dvecIn1 loaded with first 2 and next 2 elements of inChannels as x
+             is unrolled 4 times loaded as first element of dvecIn1,first element of dvecIn2....first element of dvecIn4,
+             second element of dvecIn1,second element of dvecIn2....second element of dvecIn4,
+             third element of dvecIn1,third element of dvecIn2....third element of dvecIn4,
+             fourth element of dvecIn1,fourth element of dvecIn2....fourth element of dvecIn4 for
+             dvecIn, for dvecIn2 next four elements of input*/
+          dvecIn  = IVP_SEL2NX8(dvecIn2, dvecIn1, sel1);
+          dvecIn2 = IVP_SEL2NX8(dvecIn4, dvecIn3, sel1);
+          dvecIn1 = IVP_SEL2NX8(dvecIn6, dvecIn5, sel1);
+          dvecIn3 = IVP_SEL2NX8(dvecIn8, dvecIn7, sel1);
+          dvecIn  = IVP_SEL2NX8I(dvecIn2, dvecIn, IVP_SELI_INTERLEAVE_1_LO);
+          dvecIn1 = IVP_SEL2NX8I(dvecIn3, dvecIn1, IVP_SELI_INTERLEAVE_1_LO);
+
+          /* 8 Coefficient Vector Loads */
+          /* Load Coefficients to vector - coefficients already aligned  */
+          xb_vec2Nx8 dvecCoeff0;
+          IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff1, 2 * coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff1;
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, 2 * coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff2;
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, 2 * coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff3;
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, 2 * coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff4;
+          IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff1, 2 * coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff5;
+          IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff2, 2 * coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff6;
+          IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff1, 2 * coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff7;
+          IVP_LV2NX8_XP(dvecCoeff7, pdvecCoeff2, 2 * coeffPitch1);
+
+
+          /* Load 4 bytes(4 channels) of input data along the depth to int32_t scalar */
+          xb_vecN_2x32v hvecIn  = IVP_MOVN_2X32_FROM2NX8(dvecIn);
+          xb_vecN_2x32v hvecIn1 = IVP_MOVN_2X32_FROM2NX8(dvecIn1);
+
+          int32_t scalarInData0 = IVP_EXTRN_2X32(hvecIn, 0);
+          int32_t scalarInData1 = IVP_EXTRN_2X32(hvecIn1, 0);
+
+          int32_t scalarInData2 = IVP_EXTRN_2X32(hvecIn, 1);
+          int32_t scalarInData3 = IVP_EXTRN_2X32(hvecIn1, 1);
+
+          int32_t scalarInData4 = IVP_EXTRN_2X32(hvecIn, 2);
+          int32_t scalarInData5 = IVP_EXTRN_2X32(hvecIn1, 2);
+
+          int32_t scalarInData6 = IVP_EXTRN_2X32(hvecIn, 3);
+          int32_t scalarInData7 = IVP_EXTRN_2X32(hvecIn1, 3);
+
+          /* Multiply and accumulate */
+          IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0);
+          IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2);
+          IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4);
+          IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6);
+
+          IVP_MULQA2N8XR8(dvecSum0, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1);
+          IVP_MULQA2N8XR8(dvecSum1, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3);
+          IVP_MULQA2N8XR8(dvecSum2, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5);
+          IVP_MULQA2N8XR8(dvecSum3, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7);
+        } /* end of for(inCh = 0; inCh < numInCh; inCh+=8)*/
+
+        if (inCh < numInCh)
+        {
+          /*Loading input vector */
+          vIn = IVP_LA2NX8_PP(pdvecIn1);
+          IVP_LA2NX8_XP(dvecIn1, vIn, pdvecIn1, inDataPitch2 * remCh1);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn1);
+          IVP_LA2NX8_XP(dvecIn2, vIn, pdvecIn1, inDataPitch2 * remCh2);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn1);
+          IVP_LA2NX8_XP(dvecIn3, vIn, pdvecIn1, inDataPitch2 * remCh3);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn1);
+          IVP_LA2NX8_XP(dvecIn4, vIn, pdvecIn1, inDataPitch2 * remCh4);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn1);
+          IVP_LA2NX8_XP(dvecIn5, vIn, pdvecIn1, inDataPitch2 * remCh5);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn1);
+          IVP_LA2NX8_XP(dvecIn6, vIn, pdvecIn1, inDataPitch2 * remCh6);
+
+          vIn = IVP_LA2NX8_PP(pdvecIn1);
+          IVP_LA2NX8_XP(dvecIn7, vIn, pdvecIn1, inDataPitch2);
+
+          dvecIn  = IVP_SEL2NX8(dvecIn2, dvecIn1, sel1);
+          dvecIn2 = IVP_SEL2NX8(dvecIn4, dvecIn3, sel1);
+          dvecIn1 = IVP_SEL2NX8(dvecIn6, dvecIn5, sel1);
+          dvecIn3 = IVP_SEL2NX8(dvecIn8, dvecIn7, sel1);
+          dvecIn  = IVP_SEL2NX8I(dvecIn2, dvecIn, IVP_SELI_INTERLEAVE_1_LO);
+          dvecIn1 = IVP_SEL2NX8I(dvecIn3, dvecIn1, IVP_SELI_INTERLEAVE_1_LO);
+
+          /* Load 4 bytes(4 channels) of input data along the depth to int32_t scalar */
+          xb_vecN_2x32v hvecIn  = IVP_MOVN_2X32_FROM2NX8(dvecIn);
+          xb_vecN_2x32v hvecIn1 = IVP_MOVN_2X32_FROM2NX8(dvecIn1);
+
+          int32_t scalarInData0 = IVP_EXTRN_2X32(hvecIn, 0);
+          int32_t scalarInData1 = IVP_EXTRN_2X32(hvecIn1, 0);
+
+          int32_t scalarInData2 = IVP_EXTRN_2X32(hvecIn, 1);
+          int32_t scalarInData3 = IVP_EXTRN_2X32(hvecIn1, 1);
+
+          int32_t scalarInData4 = IVP_EXTRN_2X32(hvecIn, 2);
+          int32_t scalarInData5 = IVP_EXTRN_2X32(hvecIn1, 2);
+
+          int32_t scalarInData6 = IVP_EXTRN_2X32(hvecIn, 3);
+          int32_t scalarInData7 = IVP_EXTRN_2X32(hvecIn1, 3);
+
+          /* 8 Coefficient Vector Loads */
+          /* Load Coefficients to vector - coefficients already aligned  */
+          xb_vec2Nx8 dvecCoeff0;
+          IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff1, coeffPitch1 * remCh1);
+
+          xb_vec2Nx8 dvecCoeff1;
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1 * remCh2);
+
+          xb_vec2Nx8 dvecCoeff2;
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1 * remCh3);
+
+          xb_vec2Nx8 dvecCoeff3;
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, coeffPitch1 * remCh4);
+
+          xb_vec2Nx8 dvecCoeff4;
+          IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff1, coeffPitch1 * remCh5);
+
+          xb_vec2Nx8 dvecCoeff5;
+          IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff1, coeffPitch1 * remCh6);
+
+          xb_vec2Nx8 dvecCoeff6;
+          IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate */
+          /* Masking the scalarInData to avoid accumulation with unintended values*/
+          IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0 & sumMask1);
+          IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2 & sumMask1);
+          IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4 & sumMask1);
+          IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6 & sumMask1);
+
+          IVP_MULQA2N8XR8(dvecSum0, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1 & sumMask2);
+          IVP_MULQA2N8XR8(dvecSum1, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3 & sumMask2);
+          IVP_MULQA2N8XR8(dvecSum2, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5 & sumMask2);
+          IVP_MULQA2N8XR8(dvecSum3, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7 & sumMask2);
+        } /* end of if (inCh < numInCh)*/
+
+        /* Storing output vector to memory */
+        xb_vec2Nx8 dvecOutData0L, dvecOutData1L, dvecOutData2L, dvecOutData3L;
+        xb_vec2Nx8 dvecOutData0H, dvecOutData1H, dvecOutData2H, dvecOutData3H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, dvecSum0, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData1L, dvecOutData1H, dvecSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData2L, dvecOutData2H, dvecSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData3L, dvecOutData3H, dvecSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, dvecSum0, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData1L, dvecOutData1H, dvecSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData2L, dvecOutData2H, dvecSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData3L, dvecOutData3H, dvecSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        pdvecOut = (xb_vec2Nx8 *) &pOut[outCh * bytesPerPixel];
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOutData0H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 1)];
+        IVP_SAV2NX8_XP(dvecOutData1L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * XT_SALT(0, remainingX - 1));
+        IVP_SAV2NX8_XP(dvecOutData1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 1));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + 2 * outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 2)];
+        IVP_SAV2NX8_XP(dvecOutData2L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * XT_SALT(0, remainingX - 2));
+        IVP_SAV2NX8_XP(dvecOutData2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 2));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + 3 * outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 3)];
+        IVP_SAV2NX8_XP(dvecOutData3L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * XT_SALT(0, remainingX - 3));
+        IVP_SAV2NX8_XP(dvecOutData3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 3));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* end of for(x = 0; x < outW; x+=4)*/
+    }   /* end of for(y = 0; y < outH; y++)*/
+  }     /* end of for(outCh = 0; outCh < numOutCh; outCh+=2*XCHAL_IVPN_SIMD_WIDTH)*/
+
+#else
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  xb_vec2Nx8* restrict pdvecCoeff;
+
+  /* This implementation uses gather operation to load 4 bytes of data each from 8 channels */
+
+  /*****     Gather Offset Computation -  8channels, 4cols, 1row   *****/
+  /*offset = pitch*[0 1 2 3 4 5 6 7 ... 0 1 2 3 4 5 6 7] +             */
+  /*        stride*[0 0 0 0 0 0 0 0 ... 3 3 3 3 3 3 3 3]               */
+  /* where [0 0 0 0 0 0 0 0 ... 3 3 3 3 3 3 3 3] =>> column indices    */
+  /*       [0 1 2 3 4 5 6 7 ... 0 1 2 3 4 5 6 7] =>> channel indices   */
+  xb_vecNx16U vecOffsets0 = IVP_MULNX16PACKL(IVP_ANDNX16(7, IVP_SEQNX16()), inDataPitch2);
+  IVP_MULANX16PACKL(vecOffsets0, IVP_SRLINX16(IVP_SEQNX16(), 3), strideU);
+
+
+  /*******           Gather Offset Computation and Coeff Mask           ********/
+  /*******  for Corner Case : (InCh < numInCh) && (InCh > (numInCh -7)) ********/
+
+  int32_t remainingInCh = numInCh - ((numInCh >> 3) << 3);
+
+  xb_vecNx16U vecOffsets1 = (xb_vecNx16U) 0;
+  uint8_t remCh1          = 0, remCh2 = 0, remCh3 = 0, remCh4 = 0, remCh5 = 0, remCh6 = 0;
+  int32_t sumMask1        = 0, sumMask2 = 0;
+
+  if (remainingInCh != 0) /* if numInCh is not a multiple of 8*/
+  {
+    /* Generating Coefficient mask such that coefficient load happens only for valid channel number*/
+    /* Coefficient mask entries for channels greater than the remainingInCh are set to 0 */
+
+    /* Finding the gather offset such that valid memory locations are accessed       */
+    /* [0 1 2 3 4 5 6 7 ... 0 1 2 3 4 5 6 7] in offset calculation is modified such  */
+    /* that channel indices greater than (remainingInCh-1) are set to (remainingInCh-1) */
+    xb_vecNx16 vecRemainingInChIdx = IVP_MINNX16(IVP_ANDNX16(7, IVP_SEQNX16()), remainingInCh - 1);
+    vecOffsets1 = IVP_MULNX16PACKL(vecRemainingInChIdx, inDataPitch2);
+    IVP_MULANX16PACKL(vecOffsets1, IVP_SRLINX16(IVP_SEQNX16(), 3), strideU);
+
+    /* Generating Coefficient mask such that coefficient load happens only for valid channel number*/
+    /* Coefficient mask entries for channels greater than the remainingInCh are set to 0 */
+    remCh1 = XT_SALT(1, remainingInCh);
+    remCh2 = XT_SALT(2, remainingInCh);
+    remCh3 = XT_SALT(3, remainingInCh);
+    remCh4 = XT_SALT(4, remainingInCh);
+    remCh5 = XT_SALT(5, remainingInCh);
+    remCh6 = XT_SALT(6, remainingInCh);
+
+    /*Generation of maskLut for handling cases when remainingInCh is not equal to 0   */
+    /*eg. if remainingInCh is equal to 2 then sumMask1 is 0000FFFF and sumMask2 is 0  */
+    /*    if remainingInCh is equal to 4 then sumMask1 is FFFFFFFF and sumMask2 is 0  */
+    /*    if remainingInCh is equal to 5 then sumMask1 is FFFFFFFF and sumMask2 is FF */
+    const uint32_t maskLut[4] = { 0xff, 0xff00, 0xff0000, 0xff000000 };
+
+    sumMask1 = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2 + maskLut[3] * remCh3;
+    sumMask2 = maskLut[0] * remCh4 + maskLut[1] * remCh5 + maskLut[2] * remCh6;
+  }
+
+  /* Unrolling of 4 is done along output width and 8 along input channels */
+  /**          Loop Starts            **/
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Along output channels*/
+  {
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y++)   /* Along output height*/
+    {
+      for (x = 0; x < outW; x += 4)   /*Along output width*/
+      {
+        xb_vecNx16U vecOffsets2;
+        xb_vecNx16U vecOffsets3;
+        /* Input Data and Output Data Pointers */
+        int8_t* pSrc = pInData + y * inDataPitch1 * strideU + x * strideU;
+        int8_t* pOut = &pOutData[(y * outDataPitch2 + x * outDataPitch1) * bytesPerPixel];
+
+        /*  For corner case handling  */
+        int32_t remainingX  = XT_MIN(4, outW - x);
+        vboolN vbOffsetMask = IVP_LTRSN(8 * remainingX);   /*8 channels*/
+        /* Assign valid address for predicated false lines */
+        vecOffsets2 = IVP_MOVNX16UT(vecOffsets0, 0, vbOffsetMask);
+        vecOffsets3 = IVP_MOVNX16UT(vecOffsets1, 0, vbOffsetMask);
+        /* Loading bias and initializing sum with bias*/
+        xb_vec2Nx24 dvecSum0, dvecSum1, dvecSum2, dvecSum3;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, dvecSum0, dvecSum1, dvecSum2, dvecSum3);
+
+        /* Coefficient Pointer */
+        pdvecCoeff = (xb_vec2Nx8 *) (&pCoeffData[outCh]);
+
+        for (inCh = 0; inCh < (numInCh - 7); inCh += 8)
+        {
+          /* Gather Operation to load 8 channels of 1x4 block of input . dvecIn will contain data  */
+          /* from 8 channels corresponding to same x and y value in consecutive positions.         */
+          xb_gsr gatherReg  = IVP_GATHERANX8S(pSrc + inCh * inDataPitch2, vecOffsets2);
+          xb_vec2Nx8 dvecIn = IVP_GATHERD2NX8_L(gatherReg);  /* LSB 8 bits of gatherReg contain the desired data*/
+
+          /* 8 Coefficient Vector Loads */
+          /* Load Coefficients to vector - coefficients already aligned  */
+          xb_vec2Nx8 dvecCoeff0;
+          IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff, coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff1;
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff2;
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff3;
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff4;
+          IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff5;
+          IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff, coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff6;
+          IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff, coeffPitch1);
+
+          xb_vec2Nx8 dvecCoeff7;
+          IVP_LV2NX8_XP(dvecCoeff7, pdvecCoeff, coeffPitch1);
+
+
+          /* Load 4 bytes(4 channels) of input data along the depth to int32_t scalar */
+          int32_t scalarInData0 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 0);
+          int32_t scalarInData1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 1);
+
+          int32_t scalarInData2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 2);
+          int32_t scalarInData3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 3);
+
+          int32_t scalarInData4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 4);
+          int32_t scalarInData5 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 5);
+
+          int32_t scalarInData6 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 6);
+          int32_t scalarInData7 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 7);
+
+          /* Multiply and accumulate */
+          IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0);
+          IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2);
+          IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4);
+          IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6);
+
+          IVP_MULQA2N8XR8(dvecSum0, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1);
+          IVP_MULQA2N8XR8(dvecSum1, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3);
+          IVP_MULQA2N8XR8(dvecSum2, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5);
+          IVP_MULQA2N8XR8(dvecSum3, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7);
+        } /* end of for(inCh = 0; inCh < numInCh; inCh+=8)*/
+
+        if (inCh < numInCh)
+        {
+          /* Gather Operation to load remainingInCh number of channels corresponding to 1x4 block */
+          /* of input. The channels to be loaded are handled by vecOffsets3 (from vecOffsets1)    */
+          xb_gsr gatherReg  = IVP_GATHERANX8S(pSrc + inCh * inDataPitch2, vecOffsets3);
+          xb_vec2Nx8 dvecIn = IVP_GATHERD2NX8_L(gatherReg); /* LSB 8 bits of gatherReg contain the desired data*/
+
+          /* Load 4 bytes of input data along the depth to int32_t scalar */
+          int32_t scalarInData0 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 0);
+          int32_t scalarInData1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 1);
+
+          int32_t scalarInData2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 2);
+          int32_t scalarInData3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 3);
+
+          int32_t scalarInData4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 4);
+          int32_t scalarInData5 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 5);
+
+          int32_t scalarInData6 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 6);
+          int32_t scalarInData7 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 7);
+
+          /* 8 Coefficient Vector Loads */
+          /* Load Coefficients to vector - coefficients already aligned  */
+          xb_vec2Nx8 dvecCoeff0;
+          IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff, coeffPitch1 * remCh1);
+
+          xb_vec2Nx8 dvecCoeff1;
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh2);
+
+          xb_vec2Nx8 dvecCoeff2;
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh3);
+
+          xb_vec2Nx8 dvecCoeff3;
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1 * remCh4);
+
+          xb_vec2Nx8 dvecCoeff4;
+          IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 * remCh5);
+
+          xb_vec2Nx8 dvecCoeff5;
+          IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff, coeffPitch1 * remCh6);
+          xb_vec2Nx8 dvecCoeff6;
+          IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff, coeffPitch1);
+
+          /* Multiply and accumulate */
+          /* Masking the scalarInData to avoid accumulation with unintended values*/
+          IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0 & sumMask1);
+          IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2 & sumMask1);
+          IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4 & sumMask1);
+          IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6 & sumMask1);
+
+          IVP_MULQA2N8XR8(dvecSum0, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1 & sumMask2);
+          IVP_MULQA2N8XR8(dvecSum1, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3 & sumMask2);
+          IVP_MULQA2N8XR8(dvecSum2, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5 & sumMask2);
+          IVP_MULQA2N8XR8(dvecSum3, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7 & sumMask2);
+        } /* end of if (inCh < numInCh)*/
+
+        /* Storing output vector to memory */
+        xb_vec2Nx8 dvecOutData0L, dvecOutData1L, dvecOutData2L, dvecOutData3L;
+        xb_vec2Nx8 dvecOutData0H, dvecOutData1H, dvecOutData2H, dvecOutData3H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, dvecSum0, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData1L, dvecOutData1H, dvecSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData2L, dvecOutData2H, dvecSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData3L, dvecOutData3H, dvecSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, dvecSum0, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData1L, dvecOutData1H, dvecSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData2L, dvecOutData2H, dvecSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData3L, dvecOutData3H, dvecSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        pdvecOut = (xb_vec2Nx8 *) &pOut[outCh * bytesPerPixel];
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOutData0H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 1)];
+        IVP_SAV2NX8_XP(dvecOutData1L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * XT_SALT(0, remainingX - 1));
+        IVP_SAV2NX8_XP(dvecOutData1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 1));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + 2 * outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 2)];
+        IVP_SAV2NX8_XP(dvecOutData2L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * XT_SALT(0, remainingX - 2));
+        IVP_SAV2NX8_XP(dvecOutData2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 2));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + 3 * outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 3)];
+        IVP_SAV2NX8_XP(dvecOutData3L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * XT_SALT(0, remainingX - 3));
+        IVP_SAV2NX8_XP(dvecOutData3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 3));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* end of for(x = 0; x < outW; x+=4)*/
+    }   /* end of for(y = 0; y < outH; y++)*/
+  }     /* end of for(outCh = 0; outCh < numOutCh; outCh+=2*XCHAL_IVPN_SIMD_WIDTH)*/
+#endif
+
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 2x2 MOD_WHD_DWH    */
+/*               3D convolution. The concrete function is selected by       */
+/*               pre-processor specifiers and generated at preprocessing    */
+/*               time. This template produces both the 2x2 MOD_WHD_DWH 3D   */
+/*               dilated convolution function and the 2x2 MOD_WHD_DWH 3D VQ */
+/*               dilated convolution function.                              */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XAI Error Code                                             */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 2x2xDxN                                     */
+/*               Input is in WHD and Output is in DWH format                */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks: validate tile types, memory placement, data orders and convolution parameters before any data is touched */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 2);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "Stride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) > 0),                                 \
+                    XAI_ERR_BADARG, "\nDilation = %hhu, value should be greater than zero", \
+                    XAI_CNN_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_EDGES_MOD_WHD(inTile, coeffTile, param);
+    XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    if (XAI_CNN_CONV_GET_DILATION(param) > 1)
+    {
+      XAI_CHECK_ERROR(XAI_CNN_CONV_GET_STRIDE(param) == 1,                                                                                  \
+                      XAI_ERR_BADARG, "\nStride = %hhu, Dilation = %hhu\nWhen dilation parameter is more than 1 stride always has to be 1", \
+                      XAI_CNN_CONV_GET_STRIDE(param), XAI_CNN_CONV_GET_DILATION(param));
+    }
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) /* shortcut: with a zero output scale no convolution is computed; outTile is filled with 0 (clamped to the ReLU limits when ReLU is enabled) */
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures (outTile is DWH: dim1 = depth, dim2 = width, dim3 = height; inTile is WHD: dim3 = depth) */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    if (numInCh > 1)
+    {
+      /* Max value of Gather Offset is (stride* inDataPitch1)+ stride + (min(numInCh-1,3)*inDataPitch2 + dilation) */
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) <                                                                         \
+                      ((USHRT_MAX - (XAI_CNN_CONV_GET_STRIDE(param) * XAI_TILE3D_GET_DIM1_PITCH(inTile) *                         \
+                                     XT_MIN(1, outH - 1)) - XAI_CNN_CONV_GET_STRIDE(param)) - XAI_CNN_CONV_GET_DILATION(param)) / \
+                      XT_MIN(numInCh - 1, 3),                                                                                     \
+                      XAI_ERR_BADARG, "\ndim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d",   \
+                      XAI_TILE3D_GET_DIM2_PITCH(inTile),                                                                          \
+                      ((USHRT_MAX - (XAI_CNN_CONV_GET_STRIDE(param) * XAI_TILE3D_GET_DIM1_PITCH(inTile) *                         \
+                                     XT_MIN(1, outH - 1)) - XAI_CNN_CONV_GET_STRIDE(param)) - XAI_CNN_CONV_GET_DILATION(param)) / \
+                      XT_MIN(numInCh - 1, 3));
+    }
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t dilation      = XAI_CNN_CONV_GET_DILATION(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3; below they are used as the input-channel, kx and ky steps respectively */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+  const int32_t kSizeU      = XAI_TILE4D_GET_DIM3(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2; below they are used as the row and input-channel steps respectively */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  int32_t dilatedKWidthU      = dilation * (kSizeU - 1) + 1;
+  int32_t dilatedKHeightU     = dilation * (kSizeU - 1) + 1;
+  int32_t leftEdge, topEdge; /* columns / rows of the dilated window lying left of / above the anchor; for an even extent the edge flag picks the larger (flag set) or the smaller half */
+
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+  /* Move pInData back by topEdge rows and leftEdge columns so that it points at the first edge element covered by the kernel window */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* One Gather per kernel row (ky = 0 and ky = 1) is issued in the
+   * inner most loop to get the Input Data for 4 Output Vectors.
+   * In every Gather, 32 elements are read, where the first 16
+   * of them correspond to the two Output vectors of row y (x, x + 1)
+   * and the other 16 of them correspond to the two Output vectors of
+   * row y + 1. To get the index values for the Gather,
+   * the following calculations are made.
+   */
+
+  /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */
+  xb_vecNx16 vecGather0123 = IVP_ANDNX16(IVP_SEQNX16(), 3);
+  xb_vecNx16 vecSelIdx     = IVP_SEQNX16();
+  /* To get the Select indexes as - 0 1 2 3 4...7 32 33 34 35 36.... */
+  IVP_ADDNX16T(vecSelIdx, vecSelIdx, 24, IVP_NOTBN(IVP_LTRNI(8)));
+  /* To get - 0 0 0 0 d*1 d*1 d*1 d*1 d*2 d*2 d*2 d*2 d*3 d*3 d*3 d*3... */
+  xb_vecNx16U vecGatherOff = IVP_SRLINX16(IVP_SEQNX16(), 2);
+  vecGatherOff = IVP_MULNX16UPACKL(vecGatherOff, (uint16_t) dilation);
+  /* Sequence - 0 P2 2*P2 3*P2 d*1 P2+d*1 2*P2+d*1 3*P2+d*1 d*2 P2+d*2 2*P2+d*2 3*P2+d*2 .. */
+  IVP_MULANX16PACKL(vecGatherOff, vecGather0123, inDataPitch2);
+  vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride), \
+                             vecGatherOff, vecSelIdx);
+  vecSelIdx = IVP_SEQNX16();
+  IVP_ADDNX16T(vecSelIdx, vecSelIdx, 16, IVP_NOTBN(IVP_LTRNI(16)));
+  vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride * inDataPitch1), vecGatherOff, vecSelIdx);
+
+  /*
+     The generated sequence is (P1 / P2 = inDataPitch1 / inDataPitch2, s = stride, d = dilation; each row below is 4 lanes = 4 input channels):
+   * 0               P2            2*P2           3*P2
+   * d               P2+d          2*P2+d         3*P2+d
+   * s               s+P2          s+2*P2         s+3*P2
+   * s+d*1           s+P2+d        s+2*P2+d       s+3*P2+d
+   * (s*P1)+0       (s*P1)+P2      (s*P1)+2*P2    (s*P1)+3*P2
+   * (s*P1)+d       (s*P1)+P2+d   (s*P1)+2*P2+d   (s*P1)+3*P2+d
+   * (s*P1)+s       (s*P1)+s+P2     (s*P1)+s+2*P2 (s*P1)+s+3*P2
+   * (s*P1)+s+d     (s*P1)+s+P2+d (s*P1)+s+2*P2+d (s*P1)+s+3*P2+d
+   */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecOut;
+  int8_t*     restrict pData1;
+  int8_t*     restrict pData2;
+
+
+  int32_t remInCh = numInCh & 3;
+
+  /*Generation of sumMask from maskLut for handling cases when remInCh is not equal to 0   */
+  /*eg. if remInCh is equal to 1 then sumMask is 000000FF  */
+  /*    if remInCh is equal to 2 then sumMask is 0000FFFF, if 3 then 00FFFFFF (assuming XT_SALT(a, b) yields a < b - TODO confirm) */
+  const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 };
+
+  uint8_t remCh1 = XT_SALT(2, remInCh + 1); /* presumably 1 when remInCh >= 2, else 0 (XT_SALT(a, b) assumed to be a < b) */
+  uint8_t remCh2 = XT_SALT(3, remInCh + 1); /* presumably 1 when remInCh == 3, else 0 */
+
+  uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2;
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* Unrolled by 2 along both Output Width and Output Height.
+   * Also, unrolled along Input Channels by 4 and completely
+   * along the Kernel Width. Gathers are used for loading Input Data.
+   */
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = (numOutCh - outCh);
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable used to handle the corner case of OutHeight being odd (numY is 1 when row y + 1 exists, else 0) */
+      int32_t numY = XT_MIN(2, outH - y) - 1;
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        xb_vecNx16U vecGatherOff1;
+
+        /* Variable used to handle the corner case of Output Width being odd (numX is 1 when column x + 1 exists, else 0) */
+        int32_t numX = XT_MIN(2, outW - x) - 1;
+
+        /* Output, Input and Coefficient Data Pointers */
+        int8_t *pOut   = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int8_t *pData  = pInData + (x * stride) + (y * stride) * inDataPitch1;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Boolean vector to handle the corner cases of Out Width and Height being odd; presumably enables the first (16 * numY) + 8 * (numX + 1) gather lanes */
+        vboolN vbXY = IVP_LTRSN((16 * numY) + 8 * (numX + 1));
+
+        /* Initialise input data pointers: pData1 is kernel row ky = 0, pData2 is kernel row ky = 1 (dilation rows further down) */
+        pData1 = pData;
+        pData2 = pData + (dilation * inDataPitch1);
+
+        /* Initialise co-efficient pointer */
+        pdvecCoeff = (xb_vec2Nx8 *) (pCoeff);
+
+        /* Assign gather offset considering corner cases of odd output height and width; lanes not enabled by vbXY use offset 0 */
+        vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbXY);
+
+        for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */
+        {
+          /* Gather Input Data corresponding to ky=0 */
+          xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff1);
+          xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1);
+
+          /* Gather Input Data corresponding to ky=1 */
+          xb_gsr gather2       = IVP_GATHERANX8S(pData2, vecGatherOff1);
+          xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2);
+
+          /* kx = 0, ky =0 */
+          /* Extracting scalar integers for QMULs: 32-bit lanes 0, 2, 4, 6 feed daccSum1..4; per the offset table each lane holds 4 input channels of one output pixel */
+          int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+          int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 2);
+          int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 4);
+          int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 6);
+
+          /* 4 Aligned Vector Loads of coefficients, one per input channel; the last post-increment leaves the pointer at +coeffPitch2 (kx = 1) */
+          xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, \
+                                               coeffPitch2 - (3 * coeffPitch1));
+
+          IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+          /* kx = 1, ky = 0*/
+          /* Extracting scalar integers for QMULs: odd 32-bit lanes 1, 3, 5, 7 carry the kx = 1 samples */
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       1);
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       3);
+          qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       5);
+          qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       7);
+
+          /* 4 Aligned Vector Loads of coefficients; the last post-increment leaves the pointer at +coeffPitch3 from the group start (kx = 0, ky = 1) */
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, -3 * coeffPitch1 - coeffPitch2 + coeffPitch3);
+
+          IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+          /* kx = 0, ky =1 */
+          /* Extracting scalar integers for QMULs (from the ky = 1 gather, dvecData2) */
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 2);
+          qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 4);
+          qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 6);
+
+          /* 4 Aligned Vector Loads of coefficients; the last post-increment leaves the pointer at +coeffPitch2 + coeffPitch3 from the group start (kx = 1, ky = 1) */
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - (3 * coeffPitch1));
+
+          IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+
+          /* kx = 1, ky = 1*/
+          /* Extracting scalar integers for QMULs */
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       1);
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       3);
+          qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       5);
+          qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       7);
+
+          /* 4 Aligned Vector Loads of coefficients; the last post-increment leaves the pointer at +4 * coeffPitch1 from the group start, i.e. tap (0, 0) of the next 4 input channels */
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - coeffPitch2 - coeffPitch3);
+
+          IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          pData1 += (4 * inDataPitch2);
+          pData2 += (4 * inDataPitch2);
+        } /* End Input Channels */
+
+        /* Handling Corner cases of Number of Input Channels not being multiple of 4 */
+        if (remInCh)
+        {
+          vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh);
+
+          /* Gather Input Data */
+          xb_vec2Nx8 dvecData1 = 0;
+
+          /* Assign valid address for predicated false lines */
+          vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbXY));
+          xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1);
+          dvecData1 = IVP_GATHERD2NX8_L(gather1);
+
+          /* Gather Input Data */
+          xb_vec2Nx8 dvecData2 = 0;
+          xb_gsr gather2       = IVP_GATHERANX8S(pData2, vecGatherOff1);
+          dvecData2 = IVP_GATHERD2NX8_L(gather2);
+
+          /* kx = 0, ky = 0 */
+          /* Extracting scalar integers for QMULs */
+          int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+          int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 2);
+          int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 4);
+          int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 6);
+
+          /* Aligned Vector Loads of coefficients; the post-increment is 0 when remCh1 / remCh2 is 0, so the same address is loaded again and its product is removed by sumMask */
+          xb_vec2Nx8 dvecCoeff1;
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+          xb_vec2Nx8 dvecCoeff2;
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+          xb_vec2Nx8 dvecCoeff3;
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+          /* Masking the qmulScalar values to avoid accumulation with unintended values*/
+          IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+          IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+          IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+          IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+          /* kx = 1, ky = 0 */
+          /* Extracting scalar integers for QMULs */
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       1);
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       3);
+          qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       5);
+          qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       7);
+
+          /* Aligned Vector Loads of coefficients; the last post-increment moves the pointer to kx = 0, ky = 1 */
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, (-(remCh1 + remCh2) * coeffPitch1) - coeffPitch2 + coeffPitch3);
+
+          /* Masking the qmulScalar values to avoid accumulation with unintended values*/
+          IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+          IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+          IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+          IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+          /* kx = 0, ky = 1 */
+          /* Extracting scalar integers for QMULs */
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 2);
+          qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 4);
+          qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 6);
+
+          /* Aligned Vector Loads of coefficients; the last post-increment moves the pointer to kx = 1, ky = 1 */
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (remCh1 + remCh2) * coeffPitch1);
+
+          /* Masking the qmulScalar values to avoid accumulation with unintended values*/
+          IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+          IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+          IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+          IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+          /* kx = 1, ky = 1*/
+          /* Extracting scalar integers for QMULs */
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       1);
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       3);
+          qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       5);
+          qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       7);
+
+          /* Aligned Vector Loads of coefficients */
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0); /* last tap: zero post-increment, pdvecCoeff is re-initialised from pCoeff for the next output position */
+
+          /* Masking the qmulScalar values to avoid accumulation with unintended values*/
+          IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+          IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+          IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+          IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+        } /* End Input Channels Corner case Handling */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 (pixel x, y) along the output depth; the H half gets a non-zero byte count only for S16 output (typeFlag = 1) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 (pixel x + 1, y) along the output depth; when numX is 0 the byte counts become 0 and the address falls back to pixel (x, y) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 (pixel x, y + 1) along the output depth; when numY is 0 the byte counts become 0 */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 (pixel x + 1, y + 1) along the output depth; the byte counts become 0 unless both numX and numY are 1 */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \
+                       numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 3x3 MOD_WHD_DWH    */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate 3x3 MOD_WHD_DWH 3D     */
+/*               dilated convolution function and 3x3 MOD_WHD_DWH 3D VQ     */
+/*               dilated convolution function                               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array (VQ variant only), CNN conv params      */
+/* Outputs     : XAI Error Code                                             */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 3x3xDxN                                     */
+/*               Input is in WHD and Output is in DWH format                */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks: tile types, memory placement, stride/dilation/shift ranges */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 3);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) > 0),                                 \
+                    XAI_ERR_BADARG, "\nDilation = %hhu, value should be greater than zero", \
+                    XAI_CNN_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_EDGE(inTile, 1 + (XAI_CNN_CONV_GET_DILATION(param) - 1));
+    XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+
+    if (XAI_CNN_CONV_GET_DILATION(param) > 1)
+    {
+      XAI_CHECK_ERROR(XAI_CNN_CONV_GET_STRIDE(param) == 1,                                                                                  \
+                      XAI_ERR_BADARG, "\nStride = %hhu, Dilation = %hhu\nWhen dilation parameter is more than 1 stride always has to be 1", \
+                      XAI_CNN_CONV_GET_STRIDE(param), XAI_CNN_CONV_GET_DILATION(param));
+    }
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+
+
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) /* zero output scale: fill the tile with a constant and skip the convolution */
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    if (numInCh > 1)
+    {
+      /* Max value of Gather Offset is (min(numInCh-1,3)*inDataPitch2 + stride + 2 * dilation) */
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) <                                                                       \
+                      ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) -                                                            \
+                        2 * XAI_CNN_CONV_GET_DILATION(param)) / XT_MIN(numInCh - 1, 3)),                                        \
+                      XAI_ERR_BADARG, "\ndim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \
+                      XAI_TILE3D_GET_DIM2_PITCH(inTile),                                                                        \
+                      ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) -                                                            \
+                        2 * XAI_CNN_CONV_GET_DILATION(param)) / XT_MIN(numInCh - 1, 3)));
+    }
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t dilation      = XAI_CNN_CONV_GET_DILATION(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+  const int32_t kSizeU      = XAI_TILE4D_GET_DIM3(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t dilatedKSize = dilation * (kSizeU - 1) + 1;
+
+  /* Step back dilatedKSize/2 rows and dilatedKSize/2 columns into the edge; the input tile must already be padded. */
+  pInData = &pInData[-((dilatedKSize / 2) * inDataPitch1 + (dilatedKSize / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Only 2 Gathers are used in this approach to get the
+   * Input Data for 4 Output Vectors. In each Gather,
+   * 24 elements are read, where each 12 of them correspond
+   * to one vector of Output along the width. To get the
+   * index values for the Gather, the following calculations
+   * are made.
+   */
+
+  /* Gather Index Calculations */
+  /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */
+  xb_vecNx16 vecGather0123 = IVP_ANDNX16(IVP_SEQNX16(), 3);
+  xb_vecNx16 vecSelIdx     = IVP_SEQNX16();
+  /* To get the Select indexes as - 0 1 2 ... 11 N N+1 N+2 ... (N = XCHAL_IVPN_SIMD_WIDTH) */
+  IVP_ADDNX16T(vecSelIdx, vecSelIdx, (XCHAL_IVPN_SIMD_WIDTH - 12), IVP_NOTBN(IVP_LTRNI(12)));
+  /* To get - 0 0 0 0 d*1 d*1 d*1 d*1 d*2 d*2 d*2 d*2 d*3 d*3 d*3 d*3... */
+  xb_vecNx16U vecGatherOff = IVP_SRLINX16(IVP_SEQNX16(), 2);
+  vecGatherOff = IVP_MULNX16UPACKL(vecGatherOff, (uint16_t) dilation);
+  /* Sequence - 0 P2 2*P2 3*P2 d*1 P2+d*1 2*P2+d*1 3*P2+d*1 d*2 P2+d*2 2*P2+d*2 3*P2+d*2 .. */
+  IVP_MULANX16PACKL(vecGatherOff, vecGather0123, inDataPitch2);
+  vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride), \
+                             vecGatherOff, vecSelIdx);
+  /* Final Index Pattern is (P2 = inDataPitch2, d = dilation, s = stride) -
+   * 0 P2 2*P2 3*P2 d*1 P2+d*1 2*P2+d*1 3*P2+d*1 d*2 P2+d*2 2*P2+d*2 3*P2+d*2
+   * s s+P2 s+2*P2 s+3*P2 s+d*1 s+P2+d*1 s+2*P2+d*1 s+3*P2+d*1 s+d*2 s+P2+d*2 s+2*P2+d*2 s+3*P2+d*2 */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecOut;
+  int8_t*     restrict pData1;
+  int8_t*     restrict pData2;
+
+  int32_t remInCh = numInCh & 3;
+
+  /* sumMask keeps one byte of each 32-bit QMUL scalar per leftover input channel. */
+  /* e.g. if remInCh is equal to 1 then sumMask is 000000FF, if 2 then 0000FFFF,   */
+  /*      if remInCh is equal to 3 then sumMask is 00FFFFFF (unused when it is 0)  */
+  const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 };
+
+  uint8_t remCh1 = XT_SALT(2, remInCh + 1);
+  uint8_t remCh2 = XT_SALT(3, remInCh + 1);
+
+  uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2;
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* Unrolled by 2 along both Output Width and Output Height.
+   * Also, unrolled along Input Channels by 4 and completely
+   * along the Kernel Width. Gathers are used for loading Input Data.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable used to handle the corner case of OutHeight being odd */
+      int32_t numY = XT_MIN(2, outH - y) - 1;
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        xb_vecNx16U vecGatherOff1;
+        xb_vecNx16U vecGatherOff2;
+
+        /* Variable used to handle the corner case of Output Width being odd */
+        int32_t numX = XT_MIN(2, outW - x) - 1;
+
+        /* Output, Input and Coefficient Data Pointers */
+        int8_t *pOut   = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int8_t *pData  = pInData + (x * stride) + (y * stride) * inDataPitch1;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Boolean vectors to handle the corner cases of Out Width and Height being odd */
+        vboolN vbX = IVP_LTRSN(12 * (numX + 1));
+        vboolN vbY = IVP_LTRSN(12 * (numX + 1) * numY);
+
+        for (ky = 0; ky < 3; ky++) /* Kernel Height Loop */
+        {
+          /* Pointer for Input Data Load */
+          pData1 = pData + ky * dilation * inDataPitch1;
+          pData2 = pData1 + (stride * inDataPitch1 * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+          /* Assign valid address (offset 0) for predicated false lanes */
+          vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbX);
+          vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, vbY);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */
+          {
+            /* Gather Input Data */
+            xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff1);
+            xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2       = IVP_GATHERANX8S(pData2, vecGatherOff2);
+            xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2);
+
+            pData1 += 4 * inDataPitch2;
+            pData2 += 4 * inDataPitch2;
+
+            /* kx = 1 */
+            /* Extracting scalar integers for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 3);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 3);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - \
+                                                 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 2 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 3 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         5);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 2 * coeffPitch2); /* rewind to kx = 1 of the next 4 input channels */
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+
+          /* Handling Corner cases of Number of Input Channels not being multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh  = numInCh - inCh; /* shadows the outer remInCh; same value (numInCh & 3) at this point */
+            vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh);
+
+            /* Gather Input Data */
+            xb_vec2Nx8 dvecData1 = 0;
+            xb_vec2Nx8 dvecData2 = 0;
+            /* Assign valid address (offset 0) for predicated false lanes */
+            vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbX));
+            vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbY));
+
+            xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1);
+            dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff2);
+            dvecData2 = IVP_GATHERD2NX8_L(gather2);
+
+            /* kx = 1 */
+            /* Extracting scalar integers for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 3);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 3);
+
+            /* Aligned Vector Loads of coefficients; the pointer only advances past channels that exist (remCh1 / remCh2) */
+            xb_vec2Nx8 dvecCoeff1;
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            xb_vec2Nx8 dvecCoeff2;
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            xb_vec2Nx8 dvecCoeff3;
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh2 + remCh1)));
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values*/
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 2 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         4);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 3 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         5);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0); /* no advance: pdvecCoeff is re-derived at the top of the next ky iteration */
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+          } /* End Input Channels Corner case Handling */
+        }   /* End Kernel Height Loop */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 (pixel x, y) along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 (pixel x+1, y) along the output depth; byte count is scaled by numX, so it is 0 on an odd-width tail */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 (pixel x, y+1) along the output depth; byte count is scaled by numY, so it is 0 on an odd-height tail */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 (pixel x+1, y+1) along the output depth; byte count is scaled by numX * numY */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \
+                       numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 4x4 MOD_WHD_DWH    */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate 4x4 MOD_WHD_DWH 3D     */
+/*               dilated convolution function and 4x4 MOD_WHD_DWH 3D VQ     */
+/*               dilated convolution function                               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 4x4xDxN                                     */
+/*               Input is in WHD and Output is in DWH format                */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param)
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 4);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "Stride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) > 0),                                 \
+                    XAI_ERR_BADARG, "\nDilation = %hhu, value should be greater than zero", \
+                    XAI_CNN_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_EDGES_MOD_WHD(inTile, coeffTile, param);
+    XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    if (XAI_CNN_CONV_GET_DILATION(param) > 1)
+    {
+      XAI_CHECK_ERROR(XAI_CNN_CONV_GET_STRIDE(param) == 1,                                                                                  \
+                      XAI_ERR_BADARG, "\nStride = %hhu, Dilation = %hhu\nWhen dilation parameter is more than 1 stride always has to be 1", \
+                      XAI_CNN_CONV_GET_STRIDE(param), XAI_CNN_CONV_GET_DILATION(param));
+    }
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    if (numInCh > 1)
+    {
+      /* Max value of Gather Offset is (min(numInCh-1,3)*inDataPitch2 + stride + 3 * dilation) */
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) <                                                                       \
+                      ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - 3 * XAI_CNN_CONV_GET_DILATION(param)) /                    \
+                       XT_MIN(numInCh - 1, 3)),                                                                                 \
+                      XAI_ERR_BADARG, "\ndim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \
+                      XAI_TILE3D_GET_DIM2_PITCH(inTile),                                                                        \
+                      ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - 3 * XAI_CNN_CONV_GET_DILATION(param)) /                    \
+                       XT_MIN(numInCh - 1, 3)));
+    }
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t dilation      = XAI_CNN_CONV_GET_DILATION(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+  const int32_t kSizeU      = XAI_TILE4D_GET_DIM3(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t dilatedKSize = dilation * (kSizeU - 1) + 1;
+  int32_t leftEdge, topEdge;
+
+  if ((dilatedKSize % 2) != 0)
+  {
+    leftEdge = dilatedKSize / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKSize / 2) : ((dilatedKSize / 2) - 1);
+  }
+
+  if ((dilatedKSize % 2) != 0)
+  {
+    topEdge = dilatedKSize / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKSize / 2) : ((dilatedKSize / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Only 2 Gathers are used in this approach to get the
+   * Input Data for 4 Output Vectors. In each Gather,
+   * 32 elements are read, where each 16 of them correspond
+   * to one vector of Output along the width. To get the
+   * index values for the Gather, the following calculations
+   * are made.
+   */
+
+  /* Gather Index Calculations */
+  /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */
+  xb_vecNx16 vecGather0123 = IVP_ANDNX16(IVP_SEQNX16(), 3);
+  xb_vecNx16 vecSelIdx     = IVP_SEQNX16();
+  /* To get the Select indexes as - 0 1 2 3 4...15 32 33 34 35 36.... */
+  IVP_ADDNX16T(vecSelIdx, vecSelIdx, 16, IVP_NOTBN(IVP_LTRNI(16)));
+  /* To get - 0 0 0 0 d*1 d*1 d*1 d*1 d*2 d*2 d*2 d*2 d*3 d*3 d*3 d*3... */
+  xb_vecNx16U vecGatherOff = IVP_SRLINX16(IVP_SEQNX16(), 2);
+  vecGatherOff = IVP_MULNX16UPACKL(vecGatherOff, (uint16_t) dilation);
+  /* Sequence - 0 P2 2*P2 3*P2 d*1 P2+d*1 2*P2+d*1 3*P2+d*1 d*2 P2+d*2 2*P2+d*2 3*P2+d*2 .. */
+  IVP_MULANX16PACKL(vecGatherOff, vecGather0123, inDataPitch2);
+  vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride), \
+                             vecGatherOff, vecSelIdx);
+
+  /* Final Index Pattern is -
+   * First 16 elements
+   * 0    P2      2*P2      3*P2
+   * d*1  P2+d*1  2*P2+d*1  3*P2+d*1
+   * d*2  P2+d*2  2*P2+d*2  3*P2+d*2
+   * d*3  P2+d*3  2*P2+d*3  3*P2+d*3
+   *
+   * Last 16 elements
+   * s      s+P2      s+2*P2      s+3*P2
+   * s+d*1  s+P2+d*1  s+2*P2+d*1  s+3*P2+d*1
+   * s+d*2  s+P2+d*2  s+2*P2+d*2  s+3*P2+d*2
+   * s+d*3  s+P2+d*3  s+2*P2+d*3  s+3*P2+d*3
+   */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+  xb_vec2Nx8* restrict pdvecCoeff3;
+  xb_vec2Nx8* restrict pdvecCoeff4;
+  xb_vec2Nx8* restrict pdvecOut;
+  int8_t*     restrict pData1;
+  int8_t*     restrict pData2;
+
+
+  int32_t remInCh = numInCh & 3;
+
+  /*Generation of maskLut for handling cases when remInCh is not equal to 0   */
+  /*eg. if remInCh is equal to 1 then sumMask is 000000FF  */
+  /*    if remInCh is equal to 2 then sumMask is 0000FFFF  */
+  /*    if remInCh is equal to 3 then sumMask is 00FFFFFF  */
+  const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 };
+
+  uint8_t remCh1 = XT_SALT(2, remInCh + 1);
+  uint8_t remCh2 = XT_SALT(3, remInCh + 1);
+
+  uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2;
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* Unrolled by 2 along both Output Width and Output Height.
+   * Also, unrolled along Input Channels by 4 and completely
+   * along the Kernel Width. Gathers are used for loading Input Data.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable used to handle the corner case of OutHeight being odd */
+      int32_t numY = XT_MIN(2, outH - y) - 1;
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        xb_vecNx16U vecGatherOff1;
+        xb_vecNx16U vecGatherOff2;
+
+        /* Variable used to handle the corner case of Output Width being odd */
+        int32_t numX = XT_MIN(2, outW - x) - 1;
+
+        /* Output, Input and Coefficient Data Pointers */
+        int8_t *pOut   = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int8_t *pData  = pInData + (x * stride) + (y * stride) * inDataPitch1;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Boolean vectors to handle the corner cases of Out Width and Height being odd */
+        vboolN vbX = IVP_LTRSN(16 * (numX + 1));
+        vboolN vbY = IVP_LTRSN(16 * (numX + 1) * numY);
+
+        for (ky = 0; ky < 4; ky++) /* Kernel Height Loop */
+        {
+          /* Pointer for Input Data Load */
+          pData1 = pData + ky * dilation * inDataPitch1;
+          pData2 = pData1 + (stride * inDataPitch1 * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3 + coeffPitch2);
+          pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3 + 2 * coeffPitch2);
+          pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3 + 3 * coeffPitch2);
+          /* Assign valid address for predicated false lines */
+          vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbX);
+          vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, vbY);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */
+          {
+            /* Gather Input Data */
+            xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff1);
+            xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2       = IVP_GATHERANX8S(pData2, vecGatherOff2);
+            xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2);
+
+            pData1 += 4 * inDataPitch2;
+            pData2 += 4 * inDataPitch2;
+
+            /* kx = 1 */
+            /* Extracting scalar integers for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 4);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff1, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 2 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         5);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff2, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 3 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         6);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         6);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff3, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff3, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff3, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff3, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 4 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         3);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         7);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         3);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         7);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff4, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff4, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff4, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff4, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+
+          /* Handling Corner cases of Number of Input Channels not being multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh  = numInCh - inCh;
+            vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh);
+
+            /* Gather Input Data */
+            xb_vec2Nx8 dvecData1 = 0;
+            xb_vec2Nx8 dvecData2 = 0;
+            /* Assign valid address for predicated false lines */
+            vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbX));
+            vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbY));
+
+            xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1);
+            dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff2);
+            dvecData2 = IVP_GATHERD2NX8_L(gather2);
+
+            /* kx = 1 */
+            /* Extracting scalar integers for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 4);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 4);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1;
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1 * remCh1);
+            xb_vec2Nx8 dvecCoeff2;
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1 * remCh2);
+            xb_vec2Nx8 dvecCoeff3;
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, 0);
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 2 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         5);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, 0);
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 3 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         6);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         6);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff3, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff3, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff3, 0);
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 4 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         3);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         7);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         3);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         7);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff4, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff4, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff4, 0);
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+          } /* End Input Channels Corner case Handling */
+        }   /* End Kernel Height Loop */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \
+                       numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 5x5 MOD_WHD_DWH    */
+/*               3D convolution. Based on pre-processor specifiers, the     */
+/*               code implementation is generated during the preprocessing  */
+/*               stage. This method can be used to generate the 5x5         */
+/*               MOD_WHD_DWH 3D dilated convolution function and the 5x5    */
+/*               MOD_WHD_DWH 3D VQ dilated convolution function.            */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 5x5xDxN                                     */
+/*               Input is in WHD and Output is in DWH format                */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 5);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) > 0),                                 \
+                    XAI_ERR_BADARG, "\nDilation = %hhu, value should be greater than zero", \
+                    XAI_CNN_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_EDGE(inTile, 2 + 2 * (XAI_CNN_CONV_GET_DILATION(param) - 1));
+    XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    if (XAI_CNN_CONV_GET_DILATION(param) > 1)
+    {
+      XAI_CHECK_ERROR(XAI_CNN_CONV_GET_STRIDE(param) == 1,                                                                                  \
+                      XAI_ERR_BADARG, "\nStride = %hhu, Dilation = %hhu\nWhen dilation parameter is more than 1 stride always has to be 1", \
+                      XAI_CNN_CONV_GET_STRIDE(param), XAI_CNN_CONV_GET_DILATION(param));
+    }
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    if (numInCh > 1)
+    {
+      /* Max value of Gather Offset is (min(numInCh-1,3)*inDataPitch2 + stride + 4 * dilation) */
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) <                                                                       \
+                      ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - 4 * XAI_CNN_CONV_GET_DILATION(param)) /                    \
+                       XT_MIN(numInCh - 1, 3)),                                                                                 \
+                      XAI_ERR_BADARG, "\ndim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \
+                      XAI_TILE3D_GET_DIM2_PITCH(inTile),                                                                        \
+                      ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - 4 * XAI_CNN_CONV_GET_DILATION(param)) /                    \
+                       XT_MIN(numInCh - 1, 3)));
+    }
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t dilation      = XAI_CNN_CONV_GET_DILATION(param);
+  const int32_t kSizeU        = XAI_TILE4D_GET_DIM3(coeffTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t dilatedKSize = dilation * (kSizeU - 1) + 1;
+
+  /* move to start of edge data only when input is already padded. */
+  pInData = &pInData[-((dilatedKSize / 2) * inDataPitch1 + (dilatedKSize / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+
+
+  /* 4 Gathers are used to load the input data. Many common elements
+   * would be loaded by separate Gathers, especially in the case of
+   * stride 1 and 2. To take advantage of having common offsets in
+   * a single Gather, 2 Gather patterns are generated as given below.
+   * For example, in the Gather patterns generated below,
+   * if stride is 1 and dilation is equal to 1, then 8 offsets are common and if stride is 2, 4 offsets
+   * are common in each Gather.
+   */
+
+  /* Gather Index Calculations */
+  xb_vecNx16 vecGather = IVP_SRLINX16(IVP_SEQNX16(), 2);
+  vecGather = IVP_MULNX16UPACKL(vecGather, (uint16_t) dilation);
+  IVP_MULANX16PACKL(vecGather, inDataPitch2, IVP_ANDNX16(IVP_SEQNX16(), 3));
+  xb_vecNx16 vecGather1 = IVP_ADDNX16(vecGather, stride);
+
+  xb_vecNx16 vecSelIdx1 = IVP_SEQNX16();
+  IVP_ADDNX16T(vecSelIdx1, vecSelIdx1, (XCHAL_IVPN_SIMD_WIDTH - 12), IVP_NOTBN(IVP_LTRNI(12)));
+  xb_vecNx16U vecGatherOff1 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx1);
+  xb_vecNx16 vecSelIdx2     = IVP_ADDNX16(IVP_SEQNX16(), 12);
+  IVP_ADDNX16T(vecSelIdx2, vecSelIdx2, (XCHAL_IVPN_SIMD_WIDTH - 12), IVP_NOTBN(IVP_LTRNI(8)));
+  xb_vecNx16U vecGatherOff2 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx2);
+  /* Index Pattern of vecGatherOff1 is -
+   * 0 P2 2*P2 3*P2 d*1 P2+d*1 2*P2+d*1 3*P2+d*1 d*2 P2+d*2 2*P2+d*2 3*P2+d*2
+   * s s+P2 s+2*P2 s+3*P2 s+d*1 s+d*1+P2 s+d*1+2*P2 s+d*1+3*P2 */
+
+  /* Index Pattern of vecGatherOff2 is -
+   * d*3 P2+d*3 2*P2+d*3 3*P2+d*3 d*4 P2+d*4 2*P2+d*4 3*P2+d*4 s+d*2 s+d*2+P2 s+d*2+2*P2 s+d*2+3*P2
+   * s+d*3 s+d*3+P2 s+d*3+2*P2 s+d*3+3*P2 s+d*4 s+d*4+P2 s+d*4+2*P2 s+d*4+3*P2 */
+
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecOut;
+  int8_t*     restrict pData1;
+  int8_t*     restrict pData2;
+
+  int32_t remInCh = numInCh & 3;
+
+  /* Generation of maskLut for handling cases when remInCh is not equal to 0 */
+  /* e.g. remInCh = 1 -> sumMask = 0x000000FF, remInCh = 2 -> sumMask = 0x0000FFFF, */
+  /*      remInCh = 3 -> sumMask = 0x00FFFFFF (one byte enabled per remaining channel) */
+  const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 };
+
+  uint32_t sumMask = maskLut[0] + maskLut[1] * XT_SALT(2, remInCh + 1) + maskLut[2] * XT_SALT(3, remInCh + 1);
+
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* 4 Gathers used for Input Data Load. Unrolled along */
+  /* Output Width and Height by 2. Also, unrolled along */
+  /* Input Channels by 4 and Kernel Width.              */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Out Channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable used for corner case handling of Out Height odd */
+      int32_t numY = XT_MIN(2, outH - y) - 1;
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        xb_vecNx16U vecGatherOff00;
+        xb_vecNx16U vecGatherOff01;
+        xb_vecNx16U vecGatherOff10;
+        xb_vecNx16U vecGatherOff11;
+
+        /* Variable used for corner case handling of Out Width odd */
+        int32_t numX = XT_MIN(2, outW - x) - 1;
+
+        /* Output, Input and Coefficient Data Pointers */
+        int8_t *pOut   = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int8_t *pData  = pInData + (x * stride) + (y * stride) * inDataPitch1;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        /* Initialize accumulators with bias values */
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Boolean Vectors for Predicate Gather with corner cases  */
+        /* handled for Out Width and Height being odd numbers      */
+        vboolN vb1 = IVP_ORBN(IVP_LTRNI(12), IVP_LTRSN(20 * numX));
+        vboolN vb2 = IVP_ORBN(IVP_LTRNI(8), IVP_LTRSN(20 * numX));
+        vboolN vb3 = IVP_ANDBN(vb1, IVP_LTRSN(20 * numY));
+        vboolN vb4 = IVP_ANDBN(vb2, IVP_LTRSN(20 * numY));
+
+        for (ky = 0; ky < 5; ky++) /* Kernel Height */
+        {
+          /* Pointer for Input Data Load */
+          pData1 = pData + ky * dilation * inDataPitch1;
+          pData2 = pData1 + (stride * inDataPitch1 * numY);
+          /* Assign valid address for predicated false lines */
+          vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, vb1);
+          vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, vb2);
+          vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, vb3);
+          vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, vb4);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+          {
+            /* Gather Load of Input Data */
+            xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff00);
+            xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2       = IVP_GATHERANX8S(pData1, vecGatherOff01);
+            xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2);
+            xb_gsr gather3       = IVP_GATHERANX8S(pData2, vecGatherOff10);
+            xb_vec2Nx8 dvecData3 = IVP_GATHERD2NX8_L(gather3);
+            xb_gsr gather4       = IVP_GATHERANX8S(pData2, vecGatherOff11);
+            xb_vec2Nx8 dvecData4 = IVP_GATHERD2NX8_L(gather4);
+
+            pData1 += 4 * inDataPitch2;
+            pData2 += 4 * inDataPitch2;
+
+            /* kx = 1 */
+            /* Extracting scalars for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 3);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 3);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * \
+                                                 coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 2 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \
+                                         4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 3 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         2);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         2);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 4 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         0);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         3);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         0);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         3);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 5 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 4 * coeffPitch2);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+
+          /* Handling Corner cases of Number of Input Channels not being multiple of 4 */
+          if (remInCh)
+          {
+            vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh);
+            /* Assign valid address for predicated false lines */
+            vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb1));
+            vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb2));
+            vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb3));
+            vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb4));
+
+            /* Gather Input Data */
+            xb_vec2Nx8 dvecData1 = 0;
+            xb_vec2Nx8 dvecData2 = 0;
+            xb_vec2Nx8 dvecData3 = 0;
+            xb_vec2Nx8 dvecData4 = 0;
+            xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff00);
+            dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2 = IVP_GATHERANX8S(pData1, vecGatherOff01);
+            dvecData2 = IVP_GATHERD2NX8_L(gather2);
+            xb_gsr gather3 = IVP_GATHERANX8S(pData2, vecGatherOff10);
+            dvecData3 = IVP_GATHERD2NX8_L(gather3);
+            xb_gsr gather4 = IVP_GATHERANX8S(pData2, vecGatherOff11);
+            dvecData4 = IVP_GATHERD2NX8_L(gather4);
+
+            /* kx = 1 */
+            /* Extracting scalars for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 3);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 3);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1;
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * XT_SALT(2, remInCh + 1));
+            xb_vec2Nx8 dvecCoeff2;
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * XT_SALT(3, \
+                                                                        remInCh + 1));
+            xb_vec2Nx8 dvecCoeff3;
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (XT_SALT(2, remInCh + 1) + XT_SALT(3, remInCh + 1)));
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 2 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \
+                                         4);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * XT_SALT(2, remInCh + 1));
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * XT_SALT(3, remInCh + 1));
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (XT_SALT(2, remInCh + 1) + XT_SALT(3, remInCh + 1)));
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 3 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         2);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         2);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * XT_SALT(2, remInCh + 1));
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * XT_SALT(3, remInCh + 1));
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (XT_SALT(2, remInCh + 1) + XT_SALT(3, remInCh + 1)));
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 4 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         0);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         3);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         0);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         3);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * XT_SALT(2, remInCh + 1));
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * XT_SALT(3, remInCh + 1));
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (XT_SALT(2, remInCh + 1) + XT_SALT(3, remInCh + 1))));
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 5 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         4);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * XT_SALT(2, remInCh + 1));
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * XT_SALT(3, remInCh + 1));
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0);
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+          } /* End Input Channels corner case handling */
+        }   /* End Kernel Height */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \
+                       numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 7x7 MOD_WHD_DWH    */
+/*               3D dilated convolution. The DILATED_VQ_CONV pre-processor  */
+/*               macro selects which function this body generates: when it  */
+/*               is defined, the VQ variant (xaiConvolvedVQ3D_...), which   */
+/*               takes an output scale array; otherwise the non-VQ variant  */
+/*               (xaiConvolved3D_...), which reads the output scale from    */
+/*               the convolution params.                                    */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array (VQ variant only),                      */
+/*               CNN convolution params structure                           */
+/* Outputs     : XAI Error Code                                             */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16 (VQ variant only)                */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 7x7xDxN                                     */
+/*               Input is in WHD and Output is in DWH format                */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 7);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1) ||                                                          \
+                    ((XAI_CNN_CONV_GET_DILATION(param) >= 1) &&                                                         \
+                     (XAI_CNN_CONV_GET_STRIDE(param) == 1)), XAI_ERR_BADARG,                                            \
+                    "\nDilation = %hhu\nDilation should be 1. It can be greater than 1 only when stride is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_EDGE(inTile, 3 + 3 * (XAI_CNN_CONV_GET_DILATION(param) - 1));
+    XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    if (numInCh > 1)
+    {
+      /* Max value of Gather Offset is (min(numInCh-1,3)*inDataPitch2 + stride + 6*dilation) */
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) <                                                                             \
+                      ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - 6 * XAI_CNN_CONV_GET_DILATION(param)) / XT_MIN(numInCh - 1, 3)), \
+                      XAI_ERR_BADARG, "dim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d",         \
+                      XAI_TILE3D_GET_DIM2_PITCH(inTile),                                                                              \
+                      ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - 6 * XAI_CNN_CONV_GET_DILATION(param)) / XT_MIN(numInCh - 1, 3)));
+    }
+  }
+
+  /* Kernel Size (NDWH) */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM3(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t dilation      = XAI_CNN_CONV_GET_DILATION(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Effective Kernel size = dilation(KernelSize - 1) + 1                */
+  /* Effective kernel size is used for calculating the min required edge */
+  int32_t dilatedKSizeU = dilation * (kSizeU - 1) + 1;
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((dilatedKSizeU / 2) * inDataPitch1 + (dilatedKSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+
+  /* 4 Gathers are being used to Load Input Data. Many common elements
+   * will be loaded in separate Gathers, especially in the case of
+   * stride 1 and 2. To take the advantage of having common offsets in
+   * a single Gather, 2 Gather Patterns are generated as given below.
+   * For example, in the Gather Patterns generated below,
+   * if stride is 1 and dilation = 1, then 12 offsets are common and
+   * if stride is 2 and dilation = 1, 8 offsets are common in each Gather.
+   */
+  /* Gather Index Calculations */
+  xb_vecNx16 vecGather = IVP_MULNX16PACKL(dilation, IVP_SRLINX16(IVP_SEQNX16(), 2));
+  IVP_MULANX16PACKL(vecGather, inDataPitch2, IVP_ANDNX16(IVP_SEQNX16(), 3));
+  xb_vecNx16 vecGather1 = IVP_ADDNX16(vecGather, stride);
+
+  xb_vecNx16 vecSelIdx1 = IVP_SEQNX16();
+  IVP_ADDNX16T(vecSelIdx1, vecSelIdx1, (XCHAL_IVPN_SIMD_WIDTH - 16), IVP_NOTBN(IVP_LTRNI(16)));
+  xb_vecNx16U vecGatherOff1 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx1);
+  xb_vecNx16 vecSelIdx2     = IVP_ADDNX16(IVP_SEQNX16(), 16);
+  IVP_ADDNX16T(vecSelIdx2, vecSelIdx2, (XCHAL_IVPN_SIMD_WIDTH - 16), IVP_NOTBN(IVP_LTRNI(12)));
+  xb_vecNx16U vecGatherOff2 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx2);
+
+  /* Index Pattern of vecGatherOff1 is -
+   * 0 P2 2*P2 3*P2 1*d P2+1*d 2*P2+1*d 3*P2+1*d 2*d P2+2*d 2*P2+2*d 3*P2+2*d 3*d P2+3*d 2*P2+3*d 3*P2+3*d
+   * s s+P2 s+2*P2 s+3*P2 s+1*d s+1*d+P2 s+1*d+2*P2 s+1*d+3*P2 s+2*d s+2*d+P2 s+2*d+2*P2 s+2*d+3*P2 */
+
+  /* Index Pattern of vecGatherOff2 is -
+   * 4*d P2+4*d 2*P2+4*d 3*P2+4*d 5*d P2+5*d 2*P2+5*d 3*P2+5*d 6*d P2+6*d 2*P2+6*d 3*P2+6*d s+3*d s+3*d+P2 s+3*d+2*P2 s+3*d+3*P2
+   * s+4*d s+4*d+P2 s+4*d+2*P2 s+4*d+3*P2 s+5*d s+5*d+P2 s+5*d+2*P2 s+5*d+3*P2 s+6*d s+6*d+P2 s+6*d+2*P2 s+6*d+3*P2   */
+
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecOut;
+  int8_t*     restrict pData1;
+  int8_t*     restrict pData2;
+
+  int32_t remInCh = numInCh & 3;
+
+  const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 };
+
+  uint8_t remCh1 = XT_SALT(2, remInCh + 1);
+  uint8_t remCh2 = XT_SALT(3, remInCh + 1);
+
+  uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2;
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* 4 Gathers are used for Input Data Load corresponding to 4         */
+  /* Output Vectors. Loop unrolled along Output Width and Height by 2. */
+  /* Also unrolled along Input Channels by 4 and Kernel Width.         */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable used for corner case handling of Out Height odd */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        xb_vecNx16U vecGatherOff00;
+        xb_vecNx16U vecGatherOff01;
+        xb_vecNx16U vecGatherOff10;
+        xb_vecNx16U vecGatherOff11;
+        /* Variable used for corner case handling of Out Width odd */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output, Input and Coefficient Data Pointers */
+        int8_t *pOut   = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int8_t *pData  = pInData + (x * stride) + (y * stride) * inDataPitch1;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+
+        /* Boolean Vectors for Predicate Gather with corner cases  */
+        /* handled for Out Width and Height being odd numbers      */
+        vboolN vb1 = IVP_ORBN(IVP_LTRNI(16), IVP_LTRSN(28 * numX));
+        vboolN vb2 = IVP_ORBN(IVP_LTRNI(12), IVP_LTRSN(28 * numX));
+        vboolN vb3 = IVP_ANDBN(vb1, IVP_LTRSN(28 * numY));
+        vboolN vb4 = IVP_ANDBN(vb2, IVP_LTRSN(28 * numY));
+
+        for (ky = 0; ky < 7; ky++) /* Kernel Height */
+        {
+          /* Pointer for Input Data Load */
+          pData1 = pData + ky * inDataPitch1 * dilation;
+          pData2 = pData1 + (stride * inDataPitch1 * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+          /* Assign valid address for predicated false lines */
+          vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, vb1);
+          vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, vb2);
+          vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, vb3);
+          vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, vb4);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Number of Input Channels */
+          {
+            /* Gathers for Input Loads */
+            xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff00);
+            xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2       = IVP_GATHERANX8S(pData1, vecGatherOff01);
+            xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2);
+            xb_gsr gather3       = IVP_GATHERANX8S(pData2, vecGatherOff10);
+            xb_vec2Nx8 dvecData3 = IVP_GATHERD2NX8_L(gather3);
+            xb_gsr gather4       = IVP_GATHERANX8S(pData2, vecGatherOff11);
+            xb_vec2Nx8 dvecData4 = IVP_GATHERD2NX8_L(gather4);
+
+            pData1 += 4 * inDataPitch2;
+            pData2 += 4 * inDataPitch2;
+
+            /* kx = 1 */
+            /* Extracting Scalars for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 4);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 2 */
+            /* Extracting Scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 5);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 3 */
+            /* Extracting Scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 6);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 6);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 4 */
+            /* Extracting Scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 3);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 3);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 3);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 3);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 5 */
+            /* Extracting Scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 6 */
+            /* Extracting Scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 5);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 7 */
+            /* Extracting Scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 6);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 6);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 6 * coeffPitch2);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+
+          /* Handling Corner cases of Number of Input Channels not being multiple of 4 */
+          if (remInCh)
+          {
+            vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh);
+            /* Assign valid address for predicated false lines */
+            vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb1));
+            vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb2));
+            vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb3));
+            vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb4));
+
+            /* Gather Input Data */
+            xb_vec2Nx8 dvecData1 = 0;
+            xb_vec2Nx8 dvecData2 = 0;
+            xb_vec2Nx8 dvecData3 = 0;
+            xb_vec2Nx8 dvecData4 = 0;
+            xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff00);
+            dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2 = IVP_GATHERANX8S(pData1, vecGatherOff01);
+            dvecData2 = IVP_GATHERD2NX8_L(gather2);
+            xb_gsr gather3 = IVP_GATHERANX8S(pData2, vecGatherOff10);
+            dvecData3 = IVP_GATHERD2NX8_L(gather3);
+            xb_gsr gather4 = IVP_GATHERANX8S(pData2, vecGatherOff11);
+            dvecData4 = IVP_GATHERD2NX8_L(gather4);
+
+
+            /* kx = 1 */
+            /* Extracting scalars for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 4);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 4);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1;
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            xb_vec2Nx8 dvecCoeff2;
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            xb_vec2Nx8 dvecCoeff3;
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 2 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 5);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 3 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 6);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 6);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 4 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 3);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 3);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 3);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 3);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 5 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 4);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 6 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 5);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 7 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 6);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 6);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0);
+
+            /* Masking the qmulScalar values to avoid accumulation with unintended values */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+          } /* End Input Channels corner case handling */
+        }   /* End Kernel Height */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for MxN MOD_WHD_DWH    */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN MOD_WHD_DWH 3D     */
+/*               dilated convolution function and MxN MOD_WHD_DWH 3D VQ     */
+/*               dilated convolution function                               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxN                                     */
+/*               Input is in WHD and Output is in DWH format                */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 16) &&                                                                         \
+                    (XAI_TILE4D_GET_DIM4(coeffTile) <= 16),                                                                           \
+                    XAI_ERR_KSIZE, "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile));
+    XAI_CHECK_EDGES_MOD_WHD(inTile, coeffTile, param);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) ||                                                           \
+                    ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) &&                                                          \
+                     (XAI_CNN_CONV_GET_STRIDE(param) == 1)), XAI_ERR_BADARG,                                              \
+                    "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when stride is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONX(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) ||                                                           \
+                    ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) &&                                                          \
+                     (XAI_CNN_CONV_GET_STRIDE(param) == 1)), XAI_ERR_BADARG,                                              \
+                    "\nDilationY = %hhu\nDilationY should be 1. It can be greater than 1 only when stride is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    if (numInCh > 1)
+    {
+      /* Max value of Gather Offset is (min(numInCh-1,7)*inDataPitch + stride*min(3,outWidth-1)) */
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) <                                                                       \
+                      ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(3, outW - 1)) / XT_MIN(numInCh - 1, 7)),            \
+                      XAI_ERR_BADARG, "\ndim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \
+                      XAI_TILE3D_GET_DIM2_PITCH(inTile),                                                                        \
+                      ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(3, outW - 1)) / XT_MIN(numInCh - 1, 7)));
+    }
+  }
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU   = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU  = XAI_TILE4D_GET_DIM4(coeffTile);
+  int32_t dilatedkWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1;
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideU       = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t leftEdge, topEdge;
+  if ((dilatedkWidthU % 2) != 0)
+  {
+    leftEdge = dilatedkWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1);
+  }
+
+  if ((dilatedkHeightU % 2) != 0)
+  {
+    topEdge = dilatedkHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  int32_t k;
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecOut;
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* The loop across kernel width and kernel height can be combined. In this  */
+  /* case the address offsets for input and coefficient need to be derived    */
+  /* from vector registers. These vector registers are initialized as follows */
+
+  xb_vecN_2x32v hvecCoeffAddrOffInit = IVP_PACKVRNRN_2X64W(IVP_MULN_2X16X32_0 \
+                                                             (IVP_MOVNX16_FROMN_2X32(IVP_SEQN_2X32()), coeffPitch2), 0);
+
+  xb_vecN_2x32v hvecInAddrOffInit = IVP_PACKVRNRN_2X64W(IVP_MULHN_2X16X32_1 \
+                                                          ((xb_vecNx16) dilationX, IVP_SEQN_2X32()), 16);
+
+  /* This implementation uses one gather operation to load 4 bytes of data each from 8 channels */
+
+  /*****    Gather Offset Computation (used inside InCh for-loop)    *****/
+  /*               InCh for-loop is executed when inCh>8                 */
+  /*                                                                     */
+  /* offset = pitch*[0 1 2 3 4 5 6 7 ... 0 1 2 3 4 5 6 7] +              */
+  /*         stride*[0 0 0 0 0 0 0 0 ... 3 3 3 3 3 3 3 3]                */
+  /*  where [0 0 0 0 0 0 0 0 ... 3 3 3 3 3 3 3 3] =>> column indices     */
+  /*        [0 1 2 3 4 5 6 7 ... 0 1 2 3 4 5 6 7] =>> channel indices    */
+  xb_vecNx16U vecOffsets0 = IVP_ADDNX16(IVP_MULNX16PACKL(IVP_ANDNX16(7, IVP_SEQNX16()), inDataPitch2), \
+                                        IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 3), strideU));
+
+  /*******  Gather Offset Computation and Coeff Mask (outside InCh for-loop)   ********/
+
+  /* ((numInCh>>3)<<3) = largest multiple of 8 less numInCh-8 */
+  /* Loop across inCh is executed only when numInCh > 8       */
+  int32_t remainingInCh = (numInCh - ((numInCh >> 3) << 3));
+  remainingInCh = remainingInCh != 0 ? remainingInCh : 8;
+
+  /* Generating Coefficient mask such that coefficient load happens only for valid channel number*/
+  /* Coefficient mask entries for channels greater than the remainingInCh are set to 0 */
+  uint8_t remCh1 = XT_SALT(1, remainingInCh);
+  uint8_t remCh2 = XT_SALT(2, remainingInCh);
+  uint8_t remCh3 = XT_SALT(3, remainingInCh);
+  uint8_t remCh4 = XT_SALT(4, remainingInCh);
+  uint8_t remCh5 = XT_SALT(5, remainingInCh);
+  uint8_t remCh6 = XT_SALT(6, remainingInCh);
+  uint8_t remCh7 = XT_SALT(7, remainingInCh);
+
+  /*Generation of maskLut for handling cases when remainingInCh is not equal to 0   */
+  /*eg. if remainingInCh is equal to 2 then sumMask1 is 00FFFFFF and sumMask2 is 0  */
+  /*    if remainingInCh is equal to 3 then sumMask1 is FFFFFFFF and sumMask2 is 0  */
+  /*    if remainingInCh is equal to 4 then sumMask1 is FFFFFFFF and sumMask2 is FF */
+  const uint32_t maskLut[4] = { 0xff, 0xff00, 0xff0000, 0xff000000 };
+
+  int32_t sumMask1 = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2 + maskLut[3] * remCh3;
+  int32_t sumMask2 = maskLut[0] * remCh4 + maskLut[1] * remCh5 + maskLut[2] * remCh6 + maskLut[3] * remCh7;
+
+  /* Finding the gather offset such that valid memory locations are accessed       */
+  /* [0 1 2 3 4 5 6 7 ... 0 1 2 3 4 5 6 7] in offset calculation is modified such  */
+  /* that columns greater than (remainingInCh-1) are set to (remainingInCh-1)      */
+  xb_vecNx16 vecRemainingInChIdx = IVP_MINNX16(IVP_ANDNX16(7, IVP_SEQNX16()), remainingInCh - 1);
+  xb_vecNx16U vecOffsets1        = IVP_ADDNX16(IVP_MULNX16PACKL(vecRemainingInChIdx, inDataPitch2), \
+                                               IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 3), strideU));
+
+  /**  Output width is unrolled by 4 and Input Channels is unrolled by 8 **/
+
+  /********* Loop Starts ************/
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Along output channels*/
+  {
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y++)   /* Along output height*/
+    {
+      xb_vecNx16U vecOffsets2;
+      xb_vecNx16U vecOffsets3;
+      for (x = 0; x < outW; x += 4)   /*Along output width*/
+      {
+        /*  For corner case handling  */
+        int32_t remainingX  = XT_MIN(4, outW - x);
+        vboolN vbOffsetMask = IVP_LTRSN(8 * remainingX);     /* 8 channels*/
+        /* Assign valid address for predicated false lines */
+        vecOffsets2 = IVP_MOVNX16UT(vecOffsets0, 0, vbOffsetMask);
+        vecOffsets3 = IVP_MOVNX16UT(vecOffsets1, 0, vbOffsetMask);
+
+        /*  Output pointer */
+        int8_t* pOut = &pOutData[(y * outDataPitch2 + x * outDataPitch1) * bytesPerPixel];
+
+        /* Loading bias and initializing sum with bias*/
+        xb_vec2Nx24 dvecSum0 = 0, dvecSum1 = 0, dvecSum2 = 0, dvecSum3 = 0;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, dvecSum0, dvecSum1, dvecSum2, dvecSum3);
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pSrc1  = pInData + x * strideU + y * strideU * inDataPitch1;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = hvecInAddrOffInit;
+        xb_vecN_2x32v hvecCoeffAddrOff = hvecCoeffAddrOffInit;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t index, inAddrOff, coeffAddrOff;
+
+        for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */
+        {
+          /* Condition checks performed to get the Input and Coefficient        */
+          /* Pointer Offsets after combining the Kernel Width and Height Loops  */
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch1 * dilationY, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3, vbN_2);
+          index = IVP_EXTRN_2X32(hvecLaneIdx, 0);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff    = IVP_EXTRVRN_2X32(hvecInAddrOff, 4 * index);
+          coeffAddrOff = IVP_EXTRVRN_2X32(hvecCoeffAddrOff, 4 * index);
+          hvecLaneIdx  = IVP_ADDN_2X32(hvecLaneIdx, 1);
+
+          /* Pointers for Input Data Loads */
+          int8_t *pSrc = (pSrc1 + inAddrOff);
+
+          /* Pointer for Coefficient Load */
+#ifdef IS_VISION_130
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+          xb_vec2Nx8* pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff + 4 * coeffPitch1 + coeffAddrOff);
+
+          for (inCh = 0; inCh < (numInCh - 8); inCh += 8)
+          {
+            /* Gather Operation to load 8 channels of 1x4 block of input . dvecIn will contain data  */
+            /* from 8 channels corresponding to same x and y value in consecutive positions.         */
+            xb_gsr gatherReg  = IVP_GATHERANX8S(pSrc + inCh * inDataPitch2, vecOffsets2);
+            xb_vec2Nx8 dvecIn = IVP_GATHERD2NX8_L(gatherReg);  /* LSB 8 bits of gatherReg  contain the desired data*/
+
+            /* 8 Coefficient Vector Loads */
+            /* Load Coefficients to vector - coefficients already aligned  */
+            xb_vec2Nx8 dvecCoeff0;
+            IVP_L2U2NX8_XP(dvecCoeff0, pdvecCoeff, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff1;
+            IVP_L2U2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff2;
+            IVP_L2U2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff3;
+            IVP_L2U2NX8_XP(dvecCoeff3, pdvecCoeff, 5 * coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff4;
+            IVP_L2U2NX8_XP(dvecCoeff4, pdvecCoeff1, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff5;
+            IVP_L2U2NX8_XP(dvecCoeff5, pdvecCoeff1, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff6;
+            IVP_L2U2NX8_XP(dvecCoeff6, pdvecCoeff1, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff7;
+            IVP_L2U2NX8_XP(dvecCoeff7, pdvecCoeff1, 5 * coeffPitch1);
+
+            /* Load 4 bytes of input data along the depth to int32_t scalar */
+            int32_t scalarInData0 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 0);
+            int32_t scalarInData1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 1);
+
+            int32_t scalarInData2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 2);
+            int32_t scalarInData3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 3);
+
+            int32_t scalarInData4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 4);
+            int32_t scalarInData5 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 5);
+
+            int32_t scalarInData6 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 6);
+            int32_t scalarInData7 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 7);
+
+            /* Multiply and accumulate */
+            IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0);
+            IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2);
+            IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4);
+            IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6);
+
+            IVP_MULQA2N8XR8(dvecSum0, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1);
+            IVP_MULQA2N8XR8(dvecSum1, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3);
+            IVP_MULQA2N8XR8(dvecSum2, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5);
+            IVP_MULQA2N8XR8(dvecSum3, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7);
+          }  /* end of for(inCh = 0; inCh < (numInCh-8); inCh+=8)*/
+
+          /*Gather Operation to load remainingCh number of channels corresponding to 1x4 block   */
+          /*of input. The channels to be loaded are handled by vecOffsets1 */
+          xb_gsr gatherReg  = IVP_GATHERANX8S(pSrc + inCh * inDataPitch2, vecOffsets3);
+          xb_vec2Nx8 dvecIn = IVP_GATHERD2NX8_L(gatherReg); /* LSB 8 bits of gatherReg contain the desired data*/
+
+          /* Load 4 bytes of input data along the depth to int32_t scalar */
+          int32_t scalarInData0 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 0);
+          int32_t scalarInData1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 1);
+
+          int32_t scalarInData2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 2);
+          int32_t scalarInData3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 3);
+
+          int32_t scalarInData4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 4);
+          int32_t scalarInData5 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 5);
+
+          int32_t scalarInData6 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 6);
+          int32_t scalarInData7 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 7);
+
+          /* 8 Coefficient Vector Loads */
+          /* Load Coefficients to vector - coefficients already aligned  */
+          xb_vec2Nx8 dvecCoeff0;
+          IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff, coeffPitch1 * remCh1);
+
+          xb_vec2Nx8 dvecCoeff1;
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh2);
+
+          xb_vec2Nx8 dvecCoeff2;
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh3);
+
+          xb_vec2Nx8 dvecCoeff3;
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1 * remCh4);
+
+          xb_vec2Nx8 dvecCoeff4;
+          IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 * remCh5);
+
+          xb_vec2Nx8 dvecCoeff5;
+          IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff, coeffPitch1 * remCh6);
+
+          xb_vec2Nx8 dvecCoeff6;
+          IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff, coeffPitch1 * remCh7);
+
+          xb_vec2Nx8 dvecCoeff7;
+          IVP_LV2NX8_XP(dvecCoeff7, pdvecCoeff, coeffPitch1);
+
+          /* Multiply and accumulate */
+          IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0 & sumMask1);
+          IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2 & sumMask1);
+          IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4 & sumMask1);
+          IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6 & sumMask1);
+
+          IVP_MULQA2N8XR8(dvecSum0, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1 & sumMask2);
+          IVP_MULQA2N8XR8(dvecSum1, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3 & sumMask2);
+          IVP_MULQA2N8XR8(dvecSum2, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5 & sumMask2);
+          IVP_MULQA2N8XR8(dvecSum3, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7 & sumMask2);
+        } /* end of for (k = 0; k < kHeightU * kWidthU; k++)*/
+
+#else
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          for (inCh = 0; inCh < (numInCh - 8); inCh += 8)
+          {
+            /* Gather Operation to load 8 channels of 1x4 block of input . dvecIn will contain data  */
+            /* from 8 channels corresponding to same x and y value in consecutive positions.         */
+            xb_gsr gatherReg  = IVP_GATHERANX8S(pSrc + inCh * inDataPitch2, vecOffsets2);
+            xb_vec2Nx8 dvecIn = IVP_GATHERD2NX8_L(gatherReg);    /* LSB 8 bits of gatherReg  contain the desired data*/
+
+            /* 8 Coefficient Vector Loads */
+            /* Load Coefficients to vector - coefficients already aligned  */
+            xb_vec2Nx8 dvecCoeff0;
+            IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff1;
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff2;
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff3;
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff4;
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff5;
+            IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff6;
+            IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff, coeffPitch1);
+
+            xb_vec2Nx8 dvecCoeff7;
+            IVP_LV2NX8_XP(dvecCoeff7, pdvecCoeff, coeffPitch1);
+
+            /* Load 4 bytes of input data along the depth to int32_t scalar */
+            int32_t scalarInData0 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 0);
+            int32_t scalarInData1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 1);
+
+            int32_t scalarInData2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 2);
+            int32_t scalarInData3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 3);
+
+            int32_t scalarInData4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 4);
+            int32_t scalarInData5 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 5);
+
+            int32_t scalarInData6 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 6);
+            int32_t scalarInData7 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecIn)), 7);
+
+            /* Multiply and accumulate */
+            IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0);
+            IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2);
+            IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4);
+            IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6);
+
+            IVP_MULQA2N8XR8(dvecSum0, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1);
+            IVP_MULQA2N8XR8(dvecSum1, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3);
+            IVP_MULQA2N8XR8(dvecSum2, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5);
+            IVP_MULQA2N8XR8(dvecSum3, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7);
+          }  /* end of for(inCh = 0; inCh < (numInCh-8); inCh+=8)*/
+
+          /*Gather Operation to load remainingCh number of channels corresponding to 1x4 block   */
+          /*of input. The channels to be loaded are handled by vecOffsets1 */
+          xb_gsr gatherReg  = IVP_GATHERANX8S(pSrc + inCh * inDataPitch2, vecOffsets3);
+          xb_vec2Nx8 dvecIn = IVP_GATHERD2NX8_L(gatherReg); /* LSB 8 bits of gatherReg contain the desired data*/
+
+          /* Load 4 bytes of input data along the depth to int32_t scalar */
+          int32_t scalarInData0 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 0);
+          int32_t scalarInData1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 1);
+
+          int32_t scalarInData2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 2);
+          int32_t scalarInData3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 3);
+
+          int32_t scalarInData4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 4);
+          int32_t scalarInData5 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 5);
+
+          int32_t scalarInData6 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 6);
+          int32_t scalarInData7 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecIn)), 7);
+
+          /* 8 Coefficient Vector Loads */
+          /* Load Coefficients to vector - coefficients already aligned  */
+          xb_vec2Nx8 dvecCoeff0;
+          IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff, coeffPitch1 * remCh1);
+
+          xb_vec2Nx8 dvecCoeff1;
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh2);
+
+          xb_vec2Nx8 dvecCoeff2;
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh3);
+
+          xb_vec2Nx8 dvecCoeff3;
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1 * remCh4);
+
+          xb_vec2Nx8 dvecCoeff4;
+          IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 * remCh5);
+
+          xb_vec2Nx8 dvecCoeff5;
+          IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff, coeffPitch1 * remCh6);
+
+          xb_vec2Nx8 dvecCoeff6;
+          IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff, coeffPitch1 * remCh7);
+
+          xb_vec2Nx8 dvecCoeff7;
+          IVP_LV2NX8_XP(dvecCoeff7, pdvecCoeff, coeffPitch1);
+
+          /* Multiply and accumulate */
+          IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0 & sumMask1);
+          IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2 & sumMask1);
+          IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4 & sumMask1);
+          IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6 & sumMask1);
+
+          IVP_MULQA2N8XR8(dvecSum0, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1 & sumMask2);
+          IVP_MULQA2N8XR8(dvecSum1, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3 & sumMask2);
+          IVP_MULQA2N8XR8(dvecSum2, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5 & sumMask2);
+          IVP_MULQA2N8XR8(dvecSum3, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7 & sumMask2);
+        } /* end of for (k = 0; k < kHeightU * kWidthU; k++)*/
+#endif
+
+        /* Storing output vector to memory */
+        xb_vec2Nx8 dvecOutData0L = 0, dvecOutData1L = 0, dvecOutData2L = 0, dvecOutData3L = 0;
+        xb_vec2Nx8 dvecOutData0H = 0, dvecOutData1H = 0, dvecOutData2H = 0, dvecOutData3H = 0;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, dvecSum0, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData1L, dvecOutData1H, dvecSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData2L, dvecOutData2H, dvecSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData3L, dvecOutData3H, dvecSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, dvecSum0, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData1L, dvecOutData1H, dvecSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData2L, dvecOutData2H, dvecSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData3L, dvecOutData3H, dvecSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        pdvecOut = (xb_vec2Nx8 *) &pOut[outCh * bytesPerPixel];
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOutData0H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 1)];
+        IVP_SAV2NX8_XP(dvecOutData1L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * XT_SALT(0, remainingX - 1));
+        IVP_SAV2NX8_XP(dvecOutData1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 1));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + 2 * outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 2)];
+        IVP_SAV2NX8_XP(dvecOutData2L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * XT_SALT(0, remainingX - 2));
+        IVP_SAV2NX8_XP(dvecOutData2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 2));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + 3 * outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 3)];
+        IVP_SAV2NX8_XP(dvecOutData3L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * XT_SALT(0, remainingX - 3));
+        IVP_SAV2NX8_XP(dvecOutData3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 3));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* end of for(x = 0; x < outW; x+=4)*/
+    }   /* end of for(y = 0; y < outH; y++)*/
+  }     /* end of for(outCh = 0; outCh < numOutCh; outCh+=2*XCHAL_IVPN_SIMD_WIDTH)*/
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+* MOD DWH variants
+******************************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution for handling */
+/*               cases where kWidth * numInCh is a multiple of 4            */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array (VQ build only), CNN conv params struct */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 15.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Output width/height and input/output channel counts, read from the tiles */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel width and height: dims 3 and 4 of the NDWH coefficient tile */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters read from param */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t leftEdge, topEdge; /* odd kernel: k / 2; even kernel: k / 2 with edge flag set, else k / 2 - 1 */
+
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Step the input pointer back by leftEdge columns and topEdge rows (start of data including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Output clamp limits: ReLU min/max when ReLU is enabled, otherwise the full range of the output type */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 for S16 output, else 0 */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+  int32_t numIter  = kWidthU * numInCh; /* trip count of the fused (kWidth * inCh) loop, consumed 4 at a time */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /*
+   * The inCh and kWidth loops are fused into a single walk of kWidthU * numInCh
+   * input bytes per kernel row. This assumes the edges along the depth dimension
+   * of both the input data and the coefficient data are zero.
+   */
+
+  /* Loops Start: each innermost pass produces a 2x2 block of output positions */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* Channels left from outCh onwards; handles the corner case when the number
+     * of output channels is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH */
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /* Load output scale values */
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height, two rows per iteration */
+    {                             /* walk down the rows */
+      /* 1 when a second row exists in this block, 0 on the last row of an odd height */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+      for (x = 0; x < outW; x += 2) /* Image Width, two columns per iteration */
+      {                             /* walk across the columns */
+        /* 1 when a second column exists in this block, 0 on the last column of an odd width */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer for position (x, y) */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers for this output block and channel group */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Input pointers for the four outputs of the 2x2 block; numX / numY zero the offset of a missing column / row */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Pointer for Coefficient Load: kernel row ky of the current output-channel group */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for the aligning (unaligned-address) input loads */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined, 4 per iteration */
+          {
+            /* Aligning variable vector load of pixels: 4 bytes for each of the four output positions */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of each vector (32-bit element 0) into a scalar */
+            /* Scalar integers to be used for QMUL                                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients: four vectors, coeffPitch1 bytes apart, shared by all four outputs */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End (Input Channels * kWidth) */
+        } /* End Kernel Height */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 (position x, y) along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); /* byte count is 0 unless typeFlag is set (S16 output) */
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 (position x + 1, y); byte counts are scaled by numX, so 0 when the column is missing */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 (position x, y + 1); byte counts are scaled by numY, so 0 when the row is missing */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 (position x + 1, y + 1); byte counts are scaled by numX * numY */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized 3D convolution; each pass computes a 2x2      */
+/*               block of output pixels using four accumulators             */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array (DILATED_VQ_CONV builds only),          */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk, with M and N each <= 15.          */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Depth-dimension edges of inTile and coeffTile are zero.    */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,            /* input data tile                            */
+  const xai_pTile4D coeffTile,         /* coefficient tile                           */
+  const xai_pArray biasArray,          /* 32-bit bias values, indexed by outCh       */
+  const xai_pArray outputScaleArray,   /* output scales, read via VQ_INIT_OUTSCALE   */
+  xai_pTile3D outTile,                 /* output tile, written by this function      */
+  const xai_cnn_conv_params *param     /* shifts, strides, ReLU and edge flags       */
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,            /* input data tile                            */
+  const xai_pTile4D coeffTile,         /* coefficient tile                           */
+  const xai_pArray biasArray,          /* 32-bit bias values, indexed by outCh       */
+  xai_pTile3D outTile,                 /* output tile, written by this function      */
+  const xai_cnn_conv_params *param     /* shifts, strides, scale, ReLU, edge flags   */
+  )
+#endif /* DILATED_VQ_CONV */
+{
+  /* Output extents and channel counts, read from the tile descriptors */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);  /* output width  (x loop bound) */
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);  /* output height (y loop bound) */
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel width / height: dims 3 and 4 of the NDWH coefficient tile */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);  /* single scale for all channels */
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 (the dim2 pitch is not read) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t numIter = kWidthU * numInCh;  /* trip count of the fused (kWidth x inCh) loop */
+
+  int32_t leftEdge, topEdge;  /* kernel reach to the left of / above each output pixel */
+  if ((kWidthU % 2) != 0)  /* odd width: same reach on both sides */
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else  /* even width: the flag chooses whether the larger half is on the left */
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)  /* odd height: same reach above and below */
+  {
+    topEdge = kHeightU / 2;
+  }
+  else  /* even height: the flag chooses whether the larger half is on top */
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Rewind the input pointer by leftEdge columns and topEdge rows (start of data incl. edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Clamp limits: the ReLU bounds when enabled, otherwise the full range of the output type */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;  /* 1 only for S16 output */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Loop counters, store alignment state and vector pointers */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();  /* passed to every IVP_SAV2NX8_XP / IVP_SAPOS2NX8_FP below */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /*
+   * The inCh and kWidth loops are combined into one loop of numIter steps.
+   * This assumes that the edges along the Depth dimension of the input
+   * data and of the coefficient data are zero.
+   */
+
+  /* Loop nest: output-channel group -> 2 rows -> 2 columns -> kernel row -> fused inCh*kWidth */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* Channels left from outCh onwards; used as a count by the bias init,
+     * scale init and stores when fewer than 2 * XCHAL_IVPN_SIMD_WIDTH remain */
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /* Load output scale values for this output-channel group */
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* 1 when row y + 1 is inside the tile, 0 on the last row of an odd height */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* 1 when column x + 1 is inside the tile, 0 on the last column of an odd width */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output pointer for pixel (x, y), before the output-channel offset is added */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize the four accumulators (one per pixel of the 2x2 block) with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input pointer for output (x, y) scaled by the strides; coefficients for this outCh group */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Input row pointers for the 2x2 block; a zero numX / numY folds the absent pixel back */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Coefficient pointer for kernel row ky */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+          for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector loads: 4 input bytes per pixel, pointers advance by 4 */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned vector loads of four consecutive coefficient rows (step coeffPitch1) */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            /* QMUL: the same four coefficient rows feed all four pixels, each with its own scalar */
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End Input Channels */
+
+          /* Tail pass for the remaining numIter - k (0..3) fused steps; always executed */
+          {
+            int32_t remInCh = numIter - k;
+
+            /* Aligning variable vector loads: only remInCh bytes are requested per pixel */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register; bytes past   */
+            /* remInCh are presumably zero-filled by the load (TODO confirm in ISA)   */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+            /* Gates for the coefficient pointer post-increments below */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligned vector loads; a zero gate keeps the pointer on the row just loaded */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+            /* At most three rows remain, so the fourth coefficient operand is the constant 0 */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping: one L/H vector pair per accumulator */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store dvecOut1 at (x, y): L vector first; the H byte count is non-zero only for S16 */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store dvecOut2 at (x + numX, y); the byte counts are scaled by numX */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store dvecOut3 at (x, y + numY); the byte counts are scaled by numY */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store dvecOut4 at (x + numX, y + numY); the byte counts are scaled by numX * numY */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/******************************************************************************************
+* MOD DWH variants
+******************************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution for handling */
+/*               cases where kWidth * numInCh is a multiple of 4 (no tail)  */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array (DILATED_VQ_CONV builds only),          */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk, with M and N each <= 15.          */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+#ifdef IVP_MULSUQA2N8XR8  /* the whole function is compiled only when this is defined */
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,            /* input data tile, read as unsigned bytes    */
+  const xai_pTile4D coeffTile,         /* coefficient tile                           */
+  const xai_pArray biasArray,          /* 32-bit bias values, indexed by outCh       */
+  const xai_pArray outputScaleArray,   /* output scales, read via VQ_INIT_OUTSCALE   */
+  xai_pTile3D outTile,                 /* output tile, written by this function      */
+  const xai_cnn_conv_params *param     /* shifts, strides, ReLU and edge flags       */
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,            /* input data tile, read as unsigned bytes    */
+  const xai_pTile4D coeffTile,         /* coefficient tile                           */
+  const xai_pArray biasArray,          /* 32-bit bias values, indexed by outCh       */
+  xai_pTile3D outTile,                 /* output tile, written by this function      */
+  const xai_cnn_conv_params *param     /* shifts, strides, scale, ReLU, edge flags   */
+  )
+#endif /* DILATED_VQ_CONV */
+{
+  /* Output extents and channel counts, read from the tile descriptors */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);  /* output width  (x loop bound) */
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);  /* output height (y loop bound) */
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel width / height: dims 3 and 4 of the NDWH coefficient tile */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);  /* single scale for all channels */
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 (the dim2 pitch is not read) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t leftEdge, topEdge;  /* kernel reach to the left of / above each output pixel */
+
+  if ((kWidthU % 2) != 0)  /* odd width: same reach on both sides */
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else  /* even width: the flag chooses whether the larger half is on the left */
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)  /* odd height: same reach above and below */
+  {
+    topEdge = kHeightU / 2;
+  }
+  else  /* even height: the flag chooses whether the larger half is on top */
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Rewind the input pointer by leftEdge columns and topEdge rows (start of data incl. edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Clamp limits: the ReLU bounds when enabled, otherwise the full range of the output type */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;  /* 1 only for S16 output */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Loop counters, store alignment state and vector pointers */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();  /* passed to every IVP_SAV2NX8_XP / IVP_SAPOS2NX8_FP below */
+  int32_t numIter  = kWidthU * numInCh;  /* trip count of the fused loop; header requires a multiple of 4 */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /*
+   * The inCh and kWidth loops are combined into one loop of numIter steps.
+   * This assumes that the edges along the Depth dimension of the input
+   * data and of the coefficient data are zero.
+   */
+
+  /* Loop nest: output-channel group -> 2 rows -> 2 columns -> kernel row -> fused inCh*kWidth */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* Channels left from outCh onwards; used as a count by the bias init,
+     * scale init and stores when fewer than 2 * XCHAL_IVPN_SIMD_WIDTH remain */
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /* Load output scale values for this output-channel group */
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* 1 when row y + 1 is inside the tile, 0 on the last row of an odd height */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* 1 when column x + 1 is inside the tile, 0 on the last column of an odd width */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output pointer for pixel (x, y), before the output-channel offset is added */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize the four accumulators (one per pixel of the 2x2 block) with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input pointer for output (x, y) scaled by the strides; coefficients for this outCh group */
+        uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Input row pointers for the 2x2 block; a zero numX / numY folds the absent pixel back */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Coefficient pointer for kernel row ky */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined; no tail pass */
+          {
+            /* Aligning variable vector loads: 4 input bytes per pixel, pointers advance by 4 */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecData4; IVP_LAV2NX8U_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData4)), 0);
+
+            /* Aligned vector loads of four consecutive coefficient rows (step coeffPitch1) */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8  /* always true here: the enclosing function has the same guard */
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else  /* NOTE(review): never compiled while the enclosing guard stays; pairs byte i with dvecCoeff(i+1) */
+            xb_vec2Nx8U dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1)));
+            xb_vec2Nx8U dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2)));
+            xb_vec2Nx8U dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3)));
+            xb_vec2Nx8U dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4)));
+
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), dvecCoeff4);
+#endif
+          }   /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping: one L/H vector pair per accumulator */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store dvecOut1 at (x, y): L vector first; the H byte count is non-zero only for S16 */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store dvecOut2 at (x + 1, y); numX == 0 zeroes both the offset and the byte counts */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store dvecOut3 at (x, y + 1); numY == 0 zeroes both the offset and the byte counts */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store dvecOut4 at (x + 1, y + 1); offset and byte counts are zero unless numX and numY are 1 */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+#endif /* IVP_MULSUQA2N8XR8 */
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution              */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 15.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+#ifdef IVP_MULSUQA2N8XR8
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t numIter = kWidthU * numInCh; /* trip count of the combined (kWidth * numInCh) inner loop */
+
+  int32_t leftEdge, topEdge; /* odd kernel: k / 2; even kernel: k / 2 or (k / 2) - 1, chosen by the edge flag */
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 only for S16 output */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* numY = 1 when output row y + 1 exists, 0 on the last row of an odd height */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* numX = 1 when output column x + 1 exists, 0 on the last column of an odd width */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+          for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecData4; IVP_LAV2NX8U_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            /* NOTE: the enclosing guard already requires IVP_MULSUQA2N8XR8, so the #else path below is never compiled */
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            xb_vec2Nx8U dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1)));
+            xb_vec2Nx8U dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2)));
+            xb_vec2Nx8U dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3)));
+            xb_vec2Nx8U dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4)));
+
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), dvecCoeff4);
+#endif
+          }   /* End Input Channels */
+
+          /* Tail of the combined loop: remInCh = numIter - k (0..3); this block runs even when remInCh is 0 */
+          {
+            int32_t remInCh = numIter - k;
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8U dvecData4; IVP_LAV2NX8U_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData4)), 0);
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+            /* The fourth coefficient operand is the constant 0 below, so at most three taps are accumulated here */
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            xb_vec2Nx8U dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1)));
+            xb_vec2Nx8U dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2)));
+            xb_vec2Nx8U dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3)));
+            xb_vec2Nx8U dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4)));
+
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), 0);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), 0);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), 0);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), 0);
+#endif
+          }   /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth (byte counts are 0 when numX is 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth (byte counts are 0 when numY is 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth (byte counts are 0 when numX or numY is 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+#endif
+
+/*****************************************************************************
+*  convolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH_x4
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution for handling */
+/*               cases where kWidth * numInCh is a multiple of 4            */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 1x1xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_1x1_S8S8IXCa2_MOD_DWH_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitch of Coefficient Data (NDWH) in dim1 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 only for S16 output */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /* Unrolled by 2 along both Output Width and Height.
+   * Inner loop unrolled by 4 along the Input number of Channels.
+   * There is no remainder loop in this x4 variant: every pass of
+   * the inner loop consumes exactly 4 input channels.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = (numOutCh - outCh);
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* numY = 1 when output row y + 1 exists, 0 on the last row of an odd height */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* numX = 1 when output column x + 1 exists, 0 on the last column of an odd width */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Pointer for Coefficient Load */
+        int8_t *pCoeff = pCoeffData + outCh;
+        pdvecCoeff = (xb_vec2Nx8 *) pCoeff;
+
+        /* Input Data Pointers */
+        int8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2;
+        pdvecData1 = (xb_vec2Nx8 *) pData;
+        pdvecData2 = (xb_vec2Nx8 *) (pData + stride * inDataPitch1 * numX);
+        pdvecData3 = (xb_vec2Nx8 *) (pData + stride * inDataPitch2 * numY);
+        pdvecData4 = (xb_vec2Nx8 *) (pData + stride * (inDataPitch1 + inDataPitch2) * numX * numY);
+
+        valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+        valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+        valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+        valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (inCh = 0; inCh < numInCh; inCh += 4) /* Input Channels, 4 per pass */
+        {
+          /* Aligning variable vector load of pixels */
+          xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+          xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+          xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+          xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+          /* Extracting first 4 bytes of vector into address register */
+          /* Scalar integers to be used for QMUL                      */
+          int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+          int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+          int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+          int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+          /* 4 Aligned Vector Loads of coefficients */
+          xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+          /* Quad Muls */
+          IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+        } /* End Input Channels */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth; offset and byte counts are scaled by numX */
+        pOut     = pOutData + (outCh + (x + 1) * outDataPitch1 + y * outDataPitch2) * bytesPerPixel * numX;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth; offset and byte counts are scaled by numY */
+        pOut     = pOutData + (outCh + x * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numY;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth; offset and byte counts are scaled by numX * numY */
+        pOut     = pOutData + (outCh + (x + 1) * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numX * numY;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/*****************************************************************************
+*  convolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH_x4
+*  **************************************************************************/
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution for handling */
+/*               cases where kWidth * numInCh is a multiple of 4            */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 1x1xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_1x1_U8S8IXCa2_MOD_DWH_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* 1x1 convolution of a U8 input tile with S8 coefficients (numInCh multiple of 4). Get the dimensions from the tile structures: */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data (output is addressed in bytes; offsets are scaled by bytesPerPixel) */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitch of Coefficient Data (NDWH) in dim1; the only coefficient pitch used by this function */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Output clamp limits: the ReLU min/max when ReLU is enabled, otherwise the range of the output type (S16, S8, else 0..UCHAR_MAX) */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, k, x, y;
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /* Vector data registers */
+  xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4;
+  valign vaIn1, vaIn2, vaIn3, vaIn4;
+
+  /* Unrolled by 2 along both Output Width and Height.
+   * Inner loop consumes 4 Input Channels per iteration.
+   * There is no remainder loop in this variant: the number of
+   * Input Channels is expected to be a multiple of 4.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */
+  {                                                                     /* walk across the kernels */
+    /* Output channels left from outCh onwards; handles the corner case when
+     * numOutCh is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH */
+    int32_t remainingOutCh = (numOutCh - outCh);
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /* Load output scale values */
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* numY is 0 when row y + 1 is outside the tile (last row of an odd height), else 1 */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* numX is 0 when column x + 1 is outside the tile (last column of an odd width), else 1 */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Initialize the four accumulators (one per output pixel of the 2x2 block) with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Pointer for Coefficient Load */
+        int8_t *pCoeff = pCoeffData + outCh;
+        pdvecCoeff = (xb_vec2Nx8 *) pCoeff;
+
+        /* Input Data Pointers for the 2x2 output block; a zero numX / numY makes the missing neighbours alias the first pixel's address */
+        uint8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2;
+        pdvecData1 = (xb_vec2Nx8U *) pData;
+        pdvecData2 = (xb_vec2Nx8U *) (pData + stride * inDataPitch1 * numX);
+        pdvecData3 = (xb_vec2Nx8U *) (pData + stride * inDataPitch2 * numY);
+        pdvecData4 = (xb_vec2Nx8U *) (pData + stride * (inDataPitch1 + inDataPitch2) * numX * numY);
+
+        vaIn1 = IVP_LA2NX8U_PP(pdvecData1);
+        vaIn2 = IVP_LA2NX8U_PP(pdvecData2);
+        vaIn3 = IVP_LA2NX8U_PP(pdvecData3);
+        vaIn4 = IVP_LA2NX8U_PP(pdvecData4);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (k = 0; k < numInCh; k += 4)   /* Input Channels, 4 per iteration (kWidth is 1 for this kernel) */
+        {
+          /* Load 4 bytes of input data (4 input channels) for each of the 4 output pixels */
+          IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, 4);
+          IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, 4);
+          IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, 4);
+          IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, 4);
+
+#ifdef IVP_MULSUQA2N8XR8
+          /* Extracting first 4 bytes of vector into address register */
+          /* Scalar integers to be used for QMUL                      */
+          int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+          int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+          int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+          int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0);
+#else
+          xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+          xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+          xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12;
+          xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16;
+          xb_vecNx16 vecData1, vecData2;
+          xb_vecNx16 vecData3, vecData4;
+          xb_vecNx16 vecData5, vecData6;
+          xb_vecNx16 vecData7, vecData8;
+          xb_vecNx16 vecTemp1, vecTemp2;
+
+          /* Custom select pattern for DSELs */
+          int16_t sel1       = ((XCHAL_IVPN_SIMD_WIDTH << 8));
+          xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1);
+          int16_t sel2       = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1);
+          xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2);
+
+          /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+          IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+          IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+          IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+          IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+          /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+          /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+          dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+          dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+          dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+          dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+          dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+          dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+          dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+          dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+          /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+          IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+          IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+          IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+          IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+          /* 4 Aligned Vector Loads of coefficients (one per input channel of this iteration) */
+          xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+          IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+          /* Multiply unsigned x signed and accumulate to 24-bits */
+          IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+          IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+          IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+          IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+          IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4);
+          IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, dvecCoeff4);
+          IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4);
+          IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4);
+#endif
+        } /* End Input Channels */
+
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 (pixel x, y) along the output depth; the H vector byte count is 0 unless typeFlag is set (S16 output) */
+        int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 (pixel x + 1, y) along the output depth; offset and byte counts scale to 0 when numX is 0 */
+        pOut     = pOutData + (outCh + (x + 1) * outDataPitch1 + y * outDataPitch2) * bytesPerPixel * numX;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 (pixel x, y + 1) along the output depth; offset and byte counts scale to 0 when numY is 0 */
+        pOut     = pOutData + (outCh + x * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numY;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 (pixel x + 1, y + 1) along the output depth; offset and byte counts scale to 0 when numX or numY is 0 */
+        pOut     = pOutData + (outCh + (x + 1) * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numX * numY;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 1x1 MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate 1x1 MOD_DWH 3D         */
+/*               dilated convolution function and 1x1 MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               with stride equal to 1, 2 or 4                             */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 1x1xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "Stride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \
+                    XAI_ERR_BADARG, "dilation parameter has to be >= 1");
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  if (numInCh % 4 == 0)
+  {
+#ifdef DILATED_VQ_CONV
+    convolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH_x4(inTile, coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+    convolved3D_S_1x1_S8S8IXCa2_MOD_DWH_x4(inTile, coeffTile, biasArray, outTile, param);
+#endif
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /* Unrolled by 2 along both Output Width and Height.
+   * Inner loop unrolled by 4 along the Input number of Channels.
+   * Input Number of Channels less than 4 handled in a
+   * separate loop.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = (numOutCh - outCh);
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Corner case Handling if height is odd */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* Corner case Handling if width is odd */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Pointer for Coefficient Load */
+        int8_t *pCoeff = pCoeffData + outCh;
+        pdvecCoeff = (xb_vec2Nx8 *) pCoeff;
+
+        /* Input Data Pointers */
+        int8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2;
+        pdvecData1 = (xb_vec2Nx8 *) pData;
+        pdvecData2 = (xb_vec2Nx8 *) (pData + stride * inDataPitch1 * numX);
+        pdvecData3 = (xb_vec2Nx8 *) (pData + stride * inDataPitch2 * numY);
+        pdvecData4 = (xb_vec2Nx8 *) (pData + stride * (inDataPitch1 + inDataPitch2) * numX * numY);
+
+        valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+        valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+        valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+        valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+        for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+        {
+          /* Aligning variable vector load of pixels */
+          xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+          xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+          xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+          xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+          /* Extracting first 4 bytes of vector into address register */
+          /* Scalar integers to be used for QMUL                      */
+          int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+          int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+          int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+          int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+          /* 4 Aligned Vector Loads of coefficients */
+          xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+          /* Quad Muls */
+          IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+        } /* End Input Channels */
+
+        /* Corner Case Handling as No. of Input Channels not multiple of 4 */
+        {
+          int32_t remInCh = numInCh - inCh;
+
+          /* Aligning variable vector load of pixels */
+          xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+          xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+          xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+          xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+          /* Extracting first 4 bytes of vector into address register */
+          /* Scalar integers to be used for QMUL                      */
+          int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+          int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+          int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+          int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+          /* For conditional coefficient loads */
+          int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+          int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+          /* Coefficient Loads */
+          xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+          xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+          xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+          IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+        } /* End Corner case handling */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pOut     = pOutData + (outCh + (x + 1) * outDataPitch1 + y * outDataPitch2) * bytesPerPixel * numX;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pOut     = pOutData + (outCh + x * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numY;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pOut     = pOutData + (outCh + (x + 1) * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numX * numY;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 1x1 MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate 1x1 MOD_DWH 3D         */
+/*               dilated convolution function and 1x1 MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               stride equal to 1, 2 or 4                                  */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XAI Error Code                                             */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 1x1xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_1x1_U8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error checks on the tiles, arrays and convolution parameters */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "Stride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \
+                    XAI_ERR_BADARG, "dilation parameter has to be >= 1");
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) /* Zero output scale: fill the output tile and return without convolving */
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  if (numInCh % 4 == 0) /* Input channel count is a multiple of 4: dispatch to the _x4 variant */
+  {
+#ifdef DILATED_VQ_CONV
+    convolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH_x4(inTile, coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+    convolved3D_S_1x1_U8S8IXCa2_MOD_DWH_x4(inTile, coeffTile, biasArray, outTile, param);
+#endif
+    return(XAI_ERROR_STATUS()); /* NOTE(review): any value returned by the _x4 helper is not captured here - confirm */
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitch of Coefficient Data (NDWH) in dim1 (the only coefficient pitch used for a 1x1 kernel here) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 only for S16 output */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, k, x, y;
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /* Vector data registers */
+  xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4;
+  valign vaIn1, vaIn2, vaIn3, vaIn4;
+
+  /* Unrolled by 2 along both Output Width and Height.
+   * Inner loop unrolled by 4 along the Input number of Channels.
+   * The remaining 1 to 3 Input Channels are handled in a
+   * separate block after the inner loop.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH */
+    int32_t remainingOutCh = (numOutCh - outCh);
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Odd height corner case: numY is 0 when row y + 1 does not exist, else 1 */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* Odd width corner case: numX is 0 when column x + 1 does not exist, else 1 */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Pointer for Coefficient Load */
+        int8_t *pCoeff = pCoeffData + outCh;
+        pdvecCoeff = (xb_vec2Nx8 *) pCoeff;
+
+        /* Input data pointers for the 2x2 output patch; a zero numX / numY collapses that neighbour onto pData */
+        uint8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2;
+        pdvecData1 = (xb_vec2Nx8U *) pData;
+        pdvecData2 = (xb_vec2Nx8U *) (pData + stride * inDataPitch1 * numX);
+        pdvecData3 = (xb_vec2Nx8U *) (pData + stride * inDataPitch2 * numY);
+        pdvecData4 = (xb_vec2Nx8U *) (pData + stride * (inDataPitch1 + inDataPitch2) * numX * numY);
+
+        vaIn1 = IVP_LA2NX8U_PP(pdvecData1);
+        vaIn2 = IVP_LA2NX8U_PP(pdvecData2);
+        vaIn3 = IVP_LA2NX8U_PP(pdvecData3);
+        vaIn4 = IVP_LA2NX8U_PP(pdvecData4);
+
+        for (k = 0; k < numInCh - 3; k += 4)   /* Input Channels  */
+        {
+          /* Aligning variable vector load of pixels (4 input channel bytes per output pixel) */
+          IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, 4);
+          IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, 4);
+          IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, 4);
+          IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, 4);
+
+#ifdef IVP_MULSUQA2N8XR8
+          /* Extracting first 4 bytes of vector into address register */
+          /* Scalar integers to be used for QMUL                      */
+          int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+          int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+          int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+          int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0);
+#else
+          xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+          xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+          xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12;
+          xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16;
+          xb_vecNx16 vecData1, vecData2;
+          xb_vecNx16 vecData3, vecData4;
+          xb_vecNx16 vecData5, vecData6;
+          xb_vecNx16 vecData7, vecData8;
+          xb_vecNx16 vecTemp1, vecTemp2;
+
+          /* Custom select pattern for DSELs */
+          int16_t sel1       = ((XCHAL_IVPN_SIMD_WIDTH << 8));
+          xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1);
+          int16_t sel2       = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1);
+          xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2);
+
+          /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+          IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+          IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+          IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+          IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+          /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+          /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+          dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+          dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+          dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+          dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+          dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+          dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+          dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+          dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+          /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+          IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+          IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+          IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+          IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+          /* 4 Aligned Vector Loads of coefficients, one per input channel, coeffPitch1 apart */
+          xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+          IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+          /* Multiply unsigned x signed and accumulate to 24-bits */
+          IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+          IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+          IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+          IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+          IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4);
+          IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, dvecCoeff4);
+          IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4);
+          IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4);
+#endif
+        } /* End Input Channels */
+
+        /* Corner Case Handling as No. of Input Channels not multiple of 4 */
+        {
+          int32_t remInCh = numInCh - k; /* 1 to 3 here: multiples of 4 were dispatched to the _x4 variant above */
+
+          /* Aligning variable vector load of pixels (only the remInCh remaining channel bytes) */
+          IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, remInCh);
+          IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, remInCh);
+          IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, remInCh);
+          IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, remInCh);
+
+#ifdef IVP_MULSUQA2N8XR8
+          /* Extracting first 4 bytes of vector into address register */
+          /* Scalar integers to be used for QMUL                      */
+          int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+          int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+          int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+          int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0);
+#else
+          xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+          xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+          xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12;
+          xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16;
+          xb_vecNx16 vecData1, vecData2;
+          xb_vecNx16 vecData3, vecData4;
+          xb_vecNx16 vecData5, vecData6;
+          xb_vecNx16 vecData7, vecData8;
+          xb_vecNx16 vecTemp1, vecTemp2;
+
+          /* Custom select pattern for DSELs */
+          int16_t sel1       = ((XCHAL_IVPN_SIMD_WIDTH << 8));
+          xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1);
+          int16_t sel2       = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1);
+          xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2);
+
+          /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+          IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+          IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+          IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+          IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+          /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+          /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+          dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+          dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+          dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+          dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+          dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+          dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+          dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+          dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+          /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+          IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+          IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+          IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+          IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+          /* For conditional coefficient loads: the pointer only advances while further channels remain */
+          int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+          int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+          /* Coefficient Loads (a zero enable makes the next load re-read the same vector) */
+          xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+          xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+          xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+          IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); /* 0 replaces the 4th coefficient vector */
+          IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+          /* Multiply unsigned x signed and accumulate to 24-bits */
+          IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+          IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+          IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+          IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+          IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, 0); /* 0 replaces the 4th coefficient vector */
+          IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, 0);
+          IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, 0);
+          IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, 0);
+#endif
+        } /* End Corner case handling */
+
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth; the H store length is non-zero only for S16 output */
+        int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth (offset and byte counts are scaled by numX) */
+        pOut     = pOutData + (outCh + (x + 1) * outDataPitch1 + y * outDataPitch2) * bytesPerPixel * numX;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth (offset and byte counts are scaled by numY) */
+        pOut     = pOutData + (outCh + x * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numY;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth (offset and byte counts are scaled by numX * numY) */
+        pOut     = pOutData + (outCh + (x + 1) * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numX * numY;
+        pdvecOut = (xb_vec2Nx8 *) pOut;
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 2x2 MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate 2x2 MOD_DWH 3D         */
+/*               dilated convolution function and 2x2 MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               with stride equal to 1, 2 or 4                             */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 2x2xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_2x2_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param)
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 2);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "Stride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \
+                    XAI_ERR_BADARG, "dilation parameter has to be >= 1");
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Calling further optimized function if dim1Size == dim1Pitch */
+  if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && dilationX == 1 && dilationY == 1)
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0)
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                                outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                              outTile, param);
+#endif
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                             coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                           coeffTile, biasArray, outTile, param);
+#endif
+    }
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    /* Max value of Gather Offset is ((stride*min(1, outW-1) + dilation) * inDataPitch1 +
+     * min(3, numInCh - 1) + ((stride*min(1, outH-1) * inDataPitch2)) */
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_PITCH(inTile) <                                                                       \
+                    ((USHRT_MAX - XT_MIN(3, numInCh - 1) - XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(1, outH - 1) *             \
+                      XAI_TILE3D_GET_DIM2_PITCH(inTile)) / (XAI_CNN_CONV_GET_STRIDE(param) *                                  \
+                                                            XT_MIN(1, outW - 1) + XAI_CNN_CONV_GET_DILATION(param))),         \
+                    XAI_ERR_BADARG, "\ndim1Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \
+                    XAI_TILE3D_GET_DIM1_PITCH(inTile),                                                                        \
+                    ((USHRT_MAX - XT_MIN(3, numInCh - 1) - XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(1, outH - 1) *             \
+                      XAI_TILE3D_GET_DIM2_PITCH(inTile)) / (XAI_CNN_CONV_GET_STRIDE(param) *                                  \
+                                                            XT_MIN(1, outW - 1) + XAI_CNN_CONV_GET_DILATIONX(param))));
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  int32_t dilatedKWidth  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1;
+
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag  = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  int32_t leftEdge, topEdge;
+
+  if ((dilatedKWidth % 2) != 0)
+  {
+    leftEdge = dilatedKWidth / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidth / 2) : ((dilatedKWidth / 2) - 1);
+  }
+
+  if ((dilatedKHeight % 2) != 0)
+  {
+    topEdge = dilatedKHeight / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeight / 2) : ((dilatedKHeight / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch2 + leftEdge * inDataPitch1)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Only 1 Gather is used in this approach to get the
+   * Input Data for 4 Output Vectors. In every Gather,
+   * 32 elements are read, where first 16 of them correspond
+   * to two vectors of Output along the width and the other
+   * 16 of them correspond to two vectors of Output along the height.
+   * To get the index values for the Gather, the following
+   * calculations are made.
+   */
+
+  /* Gather Index Calculations */
+  /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */
+  xb_vecNx16U vecGatherOff = IVP_ANDNX16(IVP_SEQNX16(), 3);
+  xb_vecNx16 vecSelIdx     = IVP_SEQNX16();
+  /* To get the Select indexes as - 0 1 2 3 4 5 6 7 32 33 34 35 36.... */
+  IVP_ADDNX16T(vecSelIdx, vecSelIdx, 24, IVP_NOTBN(IVP_LTRNI(8)));
+  /* To get - 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 ... */
+  xb_vecNx16 vecSeqDiv4 = IVP_SRLINX16(IVP_SEQNX16(), 2);
+  /* Sequence - 0 1 2 3  d*P1 d*P1+1 d*P1+2 d*P1+3 */
+  IVP_MULANX16PACKL(vecGatherOff, vecSeqDiv4, dilationX * inDataPitch1);
+  vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride * inDataPitch1), \
+                             vecGatherOff, vecSelIdx);
+
+  xb_vecNx16 vecSelIdx2 = IVP_SEQNX16();
+  /* To get the Select indexes as - 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 32 33 34 35 36.... */
+  IVP_ADDNX16T(vecSelIdx2, vecSelIdx2, 16, IVP_NOTBN(IVP_LTRNI(16)));
+
+  vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride * inDataPitch2), \
+                             vecGatherOff, vecSelIdx2);
+
+  /* Final Index Pattern is -
+   *
+   * First 8 elements :
+   *    0       1       2       3
+   * d*P1  d*P1+1  d*P1+2  d*P1+3
+   *
+   * Second 8 elements :
+   *     s*P1      s*P1+1      s*P1+2      s*P1+3
+   * (s+d)*P1  (s+d)*P1+1  (s+d)*P1+2  (s+d)*P1+3
+   *
+   * Third 8 elements :
+   *    0+(s*P2)       1+(s*P2)       2+(s*P2)       3+(s*P2)
+   * d*P1+(s*P2)  d*P1+1+(s*P2)  d*P1+2+(s*P2)  d*P1+3+(s*P2)
+   *
+   * Last 8 elements :
+   *     s*P1+(s*P2)      s*P1+1+(s*P2)      s*P1+2+(s*P2)      s*P1+3+(s*P2)
+   * (s+d)*P1+(s*P2)  (s+d)*P1+1+(s*P2)  (s+d)*P1+2+(s*P2)  (s+d)*P1+3+(s*P2)
+   *
+   */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+  xb_vec2Nx8* restrict pdvecOut;
+  int8_t*     restrict pData1;
+  int8_t*     restrict pData2;
+
+  int32_t remCh = numInCh & 3;
+
+  /*Generation of maskLut for handling cases when remCh is not equal to 0   */
+  /*eg. if remCh is equal to 1 then sumMask is 000000FF  */
+  /*    if remCh is equal to 2 then sumMask is 0000FFFF (3 gives 00FFFFFF)  */
+  const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 };
+
+  uint8_t remCh1 = XT_SALT(2, remCh + 1);
+  uint8_t remCh2 = XT_SALT(3, remCh + 1);
+
+  uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2;
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* Unrolled by 2 along both Output Width and Output Height.
+   * Also, unrolled along Input Channels by 4 and completely
+   * along the Kernel Width. Gathers are used for loading Input Data.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = (numOutCh - outCh);
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable used to handle the corner case of OutHeight being odd */
+      int32_t numY = XT_MIN(2, outH - y) - 1;
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        xb_vecNx16U vecGatherOff1;
+
+        /* Variable used to handle the corner case of Output Width being odd */
+        int32_t numX = XT_MIN(2, outW - x) - 1;
+
+        /* Output, Input and Coefficient Data Pointers */
+        int8_t *pOut   = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int8_t *pData  = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Boolean vectors to handle the corner cases of Out Width and Height being odd */
+        vboolN vbXY = IVP_LTRSN((16 * numY) + 8 * (numX + 1));
+
+        /* Pointer for Coefficient Load */
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3);
+
+        /* Assign valid address for predicated false lines */
+        vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbXY);
+
+        /* Pointer for Input Data Load corresponding to ky = 0 */
+        pData1 = pData;
+
+        /* Pointer for Input Data Load corresponding to ky = 1 */
+        pData2 = pData1 + (dilationY * inDataPitch2);
+
+        for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */
+        {
+          /* Gather Input Data corresponding to ky = 0 */
+          xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff1);
+          xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1);
+
+          /* Gather Input Data corresponding to ky = 1 */
+          xb_gsr gather2       = IVP_GATHERANX8S(pData2, vecGatherOff1);
+          xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2);
+
+          /* ky = 0, kx = 0 */
+          /* Extracting scalar integers for QMULs */
+          int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+          int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 2);
+          int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 4);
+          int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 6);
+
+          /* 4 Aligned Vector Loads of coefficients */
+          xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, coeffPitch1);
+          xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff1, coeffPitch2 - \
+                                               3 * coeffPitch1);
+
+          IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+          /* ky = 0, kx = 1 */
+          /* Extracting scalar integers for QMULs */
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       1);
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       3);
+          qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       5);
+          qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       7);
+
+          /* 4 Aligned Vector Loads of coefficients */
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff1, coeffPitch1 - coeffPitch2);
+
+          IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+          /* ky = 1, kx = 0 */
+          /* Extracting scalar integers for QMULs */
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 2);
+          qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 4);
+          qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 6);
+
+          /* 4 Aligned Vector Loads of coefficients */
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff2, coeffPitch2 - 3 * coeffPitch1);
+
+          IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+          /* ky = 1, kx = 1 */
+          /* Extracting scalar integers for QMULs */
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       1);
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       3);
+          qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       5);
+          qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       7);
+
+          /* 4 Aligned Vector Loads of coefficients */
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, coeffPitch1);
+          IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff2, coeffPitch1 - coeffPitch2);
+
+          IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+          IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+          IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+          pData1 += 4;
+          pData2 += 4;
+        } /* End Input Channels */
+
+        /* Handling Corner cases of Number of Input Channels not being multiple of 4 */
+        if (inCh < numInCh)
+        {
+          int32_t remInCh  = numInCh - inCh;
+          vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh);
+
+          /* Gather Input Data */
+          xb_vec2Nx8 dvecData1 = 0;
+          xb_vec2Nx8 dvecData2 = 0;
+
+          /* Pointer for Input Data Load corresponding to ky = 0 */
+          pData1 = pData + inCh;
+
+          /* Pointer for Input Data Load corresponding to ky = 1 */
+          pData2 = pData1 + (dilationY * inDataPitch2);
+
+          /* Assign valid address for predicated false lines */
+          vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbXY));
+
+          /* Gather Input Data corresponding to ky = 0*/
+          xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1);
+          dvecData1 = IVP_GATHERD2NX8_L(gather1);
+
+          /* Gather Input Data corresponding to ky = 1 */
+          xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff1);
+          dvecData2 = IVP_GATHERD2NX8_L(gather2);
+
+          /* ky = 0, kx = 0 */
+          /* Extracting scalar integers for QMULs */
+          int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+          int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 2);
+          int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 4);
+          int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 6);
+
+          /* Aligned Vector Loads of coefficients */
+          xb_vec2Nx8 dvecCoeff1;
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1 * remCh1);
+          xb_vec2Nx8 dvecCoeff2;
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1 * remCh2);
+          xb_vec2Nx8 dvecCoeff3;
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+          IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+          IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+          IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+          IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+          /* ky = 0, kx = 1 */
+          /* Extracting scalar integers for QMULs */
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       1);
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       3);
+          qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       5);
+          qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                       7);
+
+          /* Aligned Vector Loads of coefficients */
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1 * remCh1);
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1 * remCh2);
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, 0);
+
+          IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+          IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+          IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+          IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+          /* ky = 1, kx = 0 */
+          /* Extracting scalar integers for QMULs */
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 2);
+          qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 4);
+          qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                         (IVP_MOVNX16_FROM2NX8(dvecData2)), 6);
+
+          /* Aligned Vector Loads of coefficients */
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1 * remCh1);
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1 * remCh2);
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+
+          IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+          IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+          IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+          IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+          /* ky = 1, kx = 1 */
+          /* Extracting scalar integers for QMULs */
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       1);
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       3);
+          qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       5);
+          qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                       7);
+
+          /* Aligned Vector Loads of coefficients */
+          IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1 * remCh1);
+          IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1 * remCh2);
+          IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, 0);
+
+          IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+          IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+          IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+          IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+        } /* End Input Channels Corner case Handling */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 3x3 MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate 3x3 MOD_DWH 3D         */
+/*               dilated convolution function and 3x3 MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               stride equal to 1, 2 or 4 (same along width and height)    */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 3x3xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks: tile types, DRAM placement, overlap, kernel size, stride, dilation, layout, edges, shifts and ReLU limits */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 3);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \
+                    XAI_ERR_BADARG, "dilation parameter has to be >= 1");
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_EDGE2(inTile, 1 + 1 * (XAI_CNN_CONV_GET_DILATIONX(param) - 1), 1 + 1 * (XAI_CNN_CONV_GET_DILATIONY(param) - 1));
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) /* zero output scale: skip the convolution and fill outTile with 0 (clamped into the ReLU range when ReLU is enabled) */
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Calling further optimized function if dim1Size == dim1Pitch and there is no dilation (dilationX == dilationY == 1) */
+  if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && dilationX == 1 && dilationY == 1)
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) /* (input channels * kernel width) is a multiple of 4: use the _x4 variant */
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                                outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                              outTile, param);
+#endif
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                             coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                           coeffTile, biasArray, outTile, param);
+#endif
+    }
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    /* Max value of Gather Offset is ((stride*min(1, outW-1) + 2 * dilationX) * inDataPitch1 +
+     * min(3, numInCh - 1)) */
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_PITCH(inTile) <                                                                       \
+                    ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) *                                 \
+                                                             XT_MIN(1, outW - 1) + 2 * XAI_CNN_CONV_GET_DILATION(param))),    \
+                    XAI_ERR_BADARG, "\ndim1Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \
+                    XAI_TILE3D_GET_DIM1_PITCH(inTile),                                                                        \
+                    ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) *                                 \
+                                                             XT_MIN(1, outW - 1) + 2 * XAI_CNN_CONV_GET_DILATION(param))));
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  int32_t dilatedKWidth  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1;
+
+  /* move to start of edge data only when input is already padded: step back by half the dilated kernel height (in rows) and width (in pixels). */
+  pInData = &pInData[-(int32_t) ((dilatedKHeight / 2) * inDataPitch2 + (dilatedKWidth / 2) * inDataPitch1)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 when outTile is S16, else 0 */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Only 2 Gathers are used in this approach to get the
+   * Input Data for 4 Output Vectors. In each Gather,
+   * 24 elements are read, where each 12 of them correspond
+   * to one vector of Output along the width. To get the
+   * index values for the Gather, the following calculations
+   * are made.
+   */
+
+  /* Gather Index Calculations */
+  /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */
+  xb_vecNx16U vecGatherOff = IVP_ANDNX16(IVP_SEQNX16(), 3);
+  xb_vecNx16 vecSelIdx     = IVP_SEQNX16();
+  /* To get the Select indexes as - 0 1 2 3 4...11 32 33 34 35 36.... */
+  IVP_ADDNX16T(vecSelIdx, vecSelIdx, 20, IVP_NOTBN(IVP_LTRNI(12)));
+  /* To get - 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 ... */
+  xb_vecNx16 vecSeqDiv4 = IVP_SRLINX16(IVP_SEQNX16(), 2);
+  /* Sequence - 0 1 2 3  d*P1 d*P1+1 d*P1+2 d*P1+3 2.d*P1 2.d*P1+1 2.d*P1+2 2.d*P1+3 ... */
+  IVP_MULANX16PACKL(vecGatherOff, vecSeqDiv4, dilationX * inDataPitch1);
+  vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride * inDataPitch1), \
+                             vecGatherOff, vecSelIdx);
+  /* Final Index Pattern is -
+   * 0 1 2 3 d*P1 d*p1+1 d*P1+2 d*P1+3 d*2*P1 d*2*P1+1 d*2*P1+2 d*2*P1+3
+   * s*P1 s*P1+1 s*P1+2 s*P1+3 (s+1*d)*P1 (s+1*d)*P1+1 (s+1*d)*P1+2 (s+1*d)*P1+3
+   * (s+2*d)*P1 (s+2*d)*P1+1 (s+2*d)*P1+2 (s+2*d)*P1+3 */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecOut;
+  int8_t*     restrict pData1;
+  int8_t*     restrict pData2;
+
+  int32_t remCh = numInCh & 3;
+
+  /* maskLut/sumMask keep only the valid low bytes of each 32-bit QMUL scalar when remCh (numInCh & 3) != 0 */
+  /* eg. remCh == 1 -> sumMask is 000000FF (remCh1 = remCh2 = 0); remCh == 2 -> sumMask is 0000FFFF (remCh1 = 1) */
+  /*     remCh == 3 -> sumMask is 00FFFFFF (remCh1 = remCh2 = 1)                                                */
+  const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 };
+
+  uint8_t remCh1 = XT_SALT(2, remCh + 1); /* 1 when remCh >= 2; assumes XT_SALT(a, b) evaluates (a < b) - TODO confirm */
+  uint8_t remCh2 = XT_SALT(3, remCh + 1); /* 1 when remCh == 3 (same assumption) */
+
+  uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2;
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* Unrolled by 2 along both Output Width and Output Height.
+   * Also, unrolled along Input Channels by 4 and completely
+   * along the Kernel Width. Gathers are used for loading Input Data.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = (numOutCh - outCh);
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable used to handle the corner case of OutHeight being odd */
+      int32_t numY = XT_MIN(2, outH - y) - 1; /* 1 when two output rows remain, 0 when only one */
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        xb_vecNx16U vecGatherOff1;
+        xb_vecNx16U vecGatherOff2;
+
+        /* Variable used to handle the corner case of Output Width being odd */
+        int32_t numX = XT_MIN(2, outW - x) - 1; /* 1 when two output columns remain, 0 when only one */
+
+        /* Output, Input and Coefficient Data Pointers */
+        int8_t *pOut   = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int8_t *pData  = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Boolean vectors to handle the corner cases of Out Width and Height being odd */
+        vboolN vbX = IVP_LTRSN(12 * (numX + 1));
+        vboolN vbY = IVP_LTRSN(12 * (numX + 1) * numY);
+
+        for (ky = 0; ky < 3; ky++) /* Kernel Height Loop */
+        {
+          /* Pointer for Input Data Load */
+          pData1 = pData + ky * dilationY * inDataPitch2;
+          pData2 = pData1 + (stride * inDataPitch2 * numY); /* input row for the second output row; equals pData1 when numY == 0 */
+          /* Assign valid address for predicated false lines */
+          vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbX);
+          vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, vbY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */
+          {
+            /* Gather Input Data */
+            xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff1);
+            xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2       = IVP_GATHERANX8S(pData2, vecGatherOff2);
+            xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2);
+
+
+            pData1 += 4;
+            pData2 += 4;
+
+            /* kx = 1 */
+            /* Extracting scalar integers for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 3);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 3);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * \
+                                                 coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 2 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 3 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         5);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 2 * coeffPitch2);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+
+          /* Handling Corner cases of Number of Input Channels not being multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh  = numInCh - inCh;
+            vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh);
+
+            /* Gather Input Data */
+            xb_vec2Nx8 dvecData1 = 0;
+            xb_vec2Nx8 dvecData2 = 0;
+            /* Assign valid address for predicated false lines */
+            vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbX));
+            vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbY));
+
+            xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1);
+            dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff2);
+            dvecData2 = IVP_GATHERD2NX8_L(gather2);
+
+            /* kx = 1 */
+            /* Extracting scalar integers for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 3);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 3);
+
+            /* Aligned Vector Loads of coefficients: the coeffPitch1 steps are scaled by remCh1 / remCh2, the 4th QMUL vector operand is 0 and the scalar is masked with sumMask */
+            xb_vec2Nx8 dvecCoeff1;
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            xb_vec2Nx8 dvecCoeff2;
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            xb_vec2Nx8 dvecCoeff3;
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 2 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         4);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (remCh1 + remCh2));
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 3 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         5);
+
+            /* Aligned Vector Loads of coefficients (last kx: no pointer rewind, pdvecCoeff is reset at the top of the next ky iteration) */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+          } /* End Input Channels Corner case Handling */
+        }   /* End Kernel Height Loop */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth (offset and byte counts are multiplied by numX, so both are 0 when numX == 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth (offset and byte counts are multiplied by numY, so both are 0 when numY == 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth (offset and byte counts are multiplied by numX * numY, so both are 0 unless numX == numY == 1) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \
+                       numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 *
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_3x3_U8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 3x3 MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate 3x3 MOD_DWH 3D         */
+/*               dilated convolution function and 3x3 MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               with stride equal to 1, 2 or 4                             */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 3x3xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3_U8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_3x3_U8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 3);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \
+                    XAI_ERR_BADARG, "dilation parameter has to be >= 1");
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_EDGE2(inTile, 1 + 1 * (XAI_CNN_CONV_GET_DILATIONX(param) - 1), 1 + 1 * (XAI_CNN_CONV_GET_DILATIONY(param) - 1));
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+#ifdef IVP_MULSUQA2N8XR8
+  /* Calling further optimized function if dim1Size == dim1Pitch and no dilation is applied (dilationX == dilationY == 1) */
+  if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && dilationX == 1 && dilationY == 1)
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0)
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                                outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                              outTile, param);
+#endif
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                             coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                           coeffTile, biasArray, outTile, param);
+#endif
+    }
+    return(XAI_ERROR_STATUS());
+  }
+#endif //#ifdef IVP_MULSUQA2N8XR8
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    /* Max value of Gather Offset is ((stride*min(1, outW-1) + 2 * dilationX) * inDataPitch1 +
+     * min(3, numInCh - 1)) */
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_PITCH(inTile) <                                                                       \
+                    ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) *                                 \
+                                                             XT_MIN(1, outW - 1) + 2 * XAI_CNN_CONV_GET_DILATION(param))),    \
+                    XAI_ERR_BADARG, "\ndim1Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \
+                    XAI_TILE3D_GET_DIM1_PITCH(inTile),                                                                        \
+                    ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) *                                 \
+                                                             XT_MIN(1, outW - 1) + 2 * XAI_CNN_CONV_GET_DILATION(param))));
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  int32_t dilatedKWidth  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1;
+
+  /* move to start of edge data only when input is already padded. */
+  pInData = &pInData[-(int32_t) ((dilatedKHeight / 2) * inDataPitch2 + (dilatedKWidth / 2) * inDataPitch1)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Only 2 Gathers are used in this approach to get the
+   * Input Data for 4 Output Vectors. In each Gather,
+   * 24 elements are read, where each 12 of them correspond
+   * to one vector of Output along the width. To get the
+   * index values for the Gather, the following calculations
+   * are made.
+   */
+
+  /* Gather Index Calculations */
+  /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */
+  xb_vecNx16U vecGatherOff = IVP_ANDNX16(IVP_SEQNX16(), 3);
+  xb_vecNx16 vecSelIdx     = IVP_SEQNX16();
+  /* To get the Select indexes as - 0 1 2 3 4...11 32 33 34 35 36.... */
+  IVP_ADDNX16T(vecSelIdx, vecSelIdx, 20, IVP_NOTBN(IVP_LTRNI(12)));
+  /* To get - 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 ... */
+  xb_vecNx16 vecSeqDiv4 = IVP_SRLINX16(IVP_SEQNX16(), 2);
+  /* Sequence - 0 1 2 3  d*P1 d*P1+1 d*P1+2 d*P1+3 2.d*P1 2.d*P1+1 2.d*P1+2 2.d*P1+3 ... */
+  IVP_MULANX16PACKL(vecGatherOff, vecSeqDiv4, dilationX * inDataPitch1);
+  vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride * inDataPitch1), \
+                             vecGatherOff, vecSelIdx);
+  /* Final Index Pattern is -
+   * 0 1 2 3 d*P1 d*p1+1 d*P1+2 d*P1+3 d*2*P1 d*2*P1+1 d*2*P1+2 d*2*P1+3
+   * s*P1 s*P1+1 s*P1+2 s*P1+3 (s+1*d)*P1 (s+1*d)*P1+1 (s+1*d)*P1+2 (s+1*d)*P1+3
+   * (s+2*d)*P1 (s+2*d)*P1+1 (s+2*d)*P1+2 (s+2*d)*P1+3 */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecOut;
+  uint8_t*     restrict pData1;
+  uint8_t*     restrict pData2;
+
+  int32_t remCh = numInCh & 3;
+
+  /* Generation of sumMask for handling cases when remCh (numInCh & 3) is not equal to 0.   */
+  /* One byte is kept per valid leftover input channel:                                     */
+  /* e.g. remCh == 1 -> sumMask is 000000FF, remCh == 2 -> 0000FFFF, remCh == 3 -> 00FFFFFF */
+  const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 };
+  uint8_t remCh1            = XT_SALT(2, remCh + 1);
+  uint8_t remCh2            = XT_SALT(3, remCh + 1);
+  uint32_t sumMask          = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2;
+
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* Unrolled by 2 along both Output Width and Output Height.
+   * Also, unrolled along Input Channels by 4 and completely
+   * along the Kernel Width. Gathers are used for loading Input Data.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = (numOutCh - outCh);
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable used to handle the corner case of OutHeight being odd */
+      int32_t numY = XT_MIN(2, outH - y) - 1;
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        xb_vecNx16U vecGatherOff1;
+        xb_vecNx16U vecGatherOff2;
+
+        /* Variable used to handle the corner case of Output Width being odd */
+        int32_t numX = XT_MIN(2, outW - x) - 1;
+
+        /* Output, Input and Coefficient Data Pointers */
+        int8_t *pOut   = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        uint8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Boolean vectors to handle the corner cases of Out Width and Height being odd */
+        vboolN vbX = IVP_LTRSN(12 * (numX + 1));
+        vboolN vbY = IVP_LTRSN(12 * (numX + 1) * numY);
+
+        for (ky = 0; ky < 3; ky++) /* Kernel Height Loop */
+        {
+          /* Pointer for Input Data Load */
+          pData1 = pData + ky * dilationY * inDataPitch2;
+          pData2 = pData1 + (stride * inDataPitch2 * numY);
+          /* Assign valid address for predicated false lines */
+          vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbX);
+          vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, vbY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */
+          {
+            /* Gather Input Data */
+            xb_gsr gather1        = IVP_GATHERANX8U(pData1, vecGatherOff1);
+            xb_vec2Nx8U dvecData1 = IVP_GATHERD2NX8U_L(gather1);
+            xb_gsr gather2        = IVP_GATHERANX8U(pData2, vecGatherOff2);
+            xb_vec2Nx8U dvecData2 = IVP_GATHERD2NX8U_L(gather2);
+
+
+            pData1 += 4;
+            pData2 += 4;
+
+            /* kx = 1 */
+            /* Extracting scalar integers for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 3);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 3);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * \
+                                                 coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            xb_vec2Nx8U dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1)));
+            xb_vec2Nx8U dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2)));
+            xb_vec2Nx8U dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3)));
+            xb_vec2Nx8U dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4)));
+
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), dvecCoeff4);
+#endif
+
+            /* kx = 2 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \
+                                         4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1)));
+            dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2)));
+            dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3)));
+            dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4)));
+
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), dvecCoeff4);
+#endif
+
+            /* kx = 3 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \
+                                         5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \
+                                         5);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 2 * coeffPitch2);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1)));
+            dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2)));
+            dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3)));
+            dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4)));
+
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), dvecCoeff4);
+#endif
+          } /* End Input Channels */
+
+          /* Handling Corner cases of Number of Input Channels not being multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh  = numInCh - inCh;
+            vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh);
+
+            /* Gather Input Data */
+            xb_vec2Nx8U dvecData1 = 0;
+            xb_vec2Nx8U dvecData2 = 0;
+            /* Assign valid address for predicated false lines */
+            vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbX));
+            vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbY));
+
+            xb_gsr gather1 = IVP_GATHERANX8U(pData1, vecGatherOff1);
+            dvecData1 = IVP_GATHERD2NX8U_L(gather1);
+            xb_gsr gather2 = IVP_GATHERANX8U(pData2, vecGatherOff2);
+            dvecData2 = IVP_GATHERD2NX8U_L(gather2);
+
+            /* kx = 1 */
+            /* Extracting scalar integers for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 3);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 3);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1;
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            xb_vec2Nx8 dvecCoeff2;
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            xb_vec2Nx8 dvecCoeff3;
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+#else
+            xb_vec2Nx8U dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1 & sumMask)));
+            xb_vec2Nx8U dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2 & sumMask)));
+            xb_vec2Nx8U dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3 & sumMask)));
+            xb_vec2Nx8U dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4 & sumMask)));
+
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), 0);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), 0);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), 0);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), 0);
+#endif
+
+            /* kx = 2 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \
+                                         4);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (remCh1 + remCh2));
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+#else
+            dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1 & sumMask)));
+            dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2 & sumMask)));
+            dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3 & sumMask)));
+            dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4 & sumMask)));
+
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), 0);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), 0);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), 0);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), 0);
+#endif
+
+            /* kx = 3 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \
+                                         5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \
+                                         5);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+#else
+            dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1 & sumMask)));
+            dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2 & sumMask)));
+            dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3 & sumMask)));
+            dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4 & sumMask)));
+
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), 0);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), 0);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), 0);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), 0);
+#endif
+          } /* End Input Channels Corner case Handling */
+        }   /* End Kernel Height Loop */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \
+                       numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 *
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 4x4 MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate 4x4 MOD_DWH 3D         */
+/*               dilated convolution function and 4x4 MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               stride equal to 1, 2 or 4                                  */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XAI Error Code                                             */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 4x4xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_4x4_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 4);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "Stride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \
+                    XAI_ERR_BADARG, "dilation parameter has to be >= 1");
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Calling further optimized function if dim1Size == dim1Pitch and dilationX == dilationY == 1 */
+  if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && dilationX == 1 && dilationY == 1)
+  {
+#ifdef DILATED_VQ_CONV
+    convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile,
+                                                              coeffTile,
+                                                              biasArray,
+                                                              outputScaleArray,
+                                                              outTile,
+                                                              param);
+#else
+    convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile,
+                                                            coeffTile,
+                                                            biasArray,
+                                                            outTile,
+                                                            param);
+#endif
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+
+    /* Max value of Gather Offset is ((stride*min(outW-1, 1) + 3 * dilationX) * inDataPitch1 +
+     * min(3, numInCh - 1)) */
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_PITCH(inTile) <                                                                                    \
+                    ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) *                                              \
+                                                             XT_MIN(1, outW - 1) + 3 * XAI_CNN_CONV_GET_DILATION(param))), XAI_ERR_BADARG, \
+                    "\ndim1Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d",                              \
+                    XAI_TILE3D_GET_DIM1_PITCH(inTile),                                                                                     \
+                    ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) *                                              \
+                                                             XT_MIN(1, outW - 1) + 3 * XAI_CNN_CONV_GET_DILATION(param))));
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+
+  const uint8_t outShiftU  = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride     = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  int32_t dilatedKWidth  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1;
+
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag  = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  int32_t leftEdge, topEdge;
+
+  if ((dilatedKWidth % 2) != 0)
+  {
+    leftEdge = dilatedKWidth / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidth / 2) : ((dilatedKWidth / 2) - 1);
+  }
+
+  if ((dilatedKHeight % 2) != 0)
+  {
+    topEdge = dilatedKHeight / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeight / 2) : ((dilatedKHeight / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch2 + leftEdge * inDataPitch1)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Only 2 Gathers are used in this approach to get the
+   * Input Data for 4 Output Vectors. In each Gather,
+   * 32 elements are read, where each 16 of them correspond
+   * to one vector of Output along the width. To get the
+   * index values for the Gather, the following calculations
+   * are made.
+   */
+
+  /* Gather Index Calculations */
+  /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */
+  xb_vecNx16U vecGatherOff = IVP_ANDNX16(IVP_SEQNX16(), 3);
+  xb_vecNx16 vecSelIdx     = IVP_SEQNX16();
+  /* To get the Select indexes as - 0 1 2 3 4...11 12 13 14 15 32 33 34 35 36.... */
+  IVP_ADDNX16T(vecSelIdx, vecSelIdx, 16, IVP_NOTBN(IVP_LTRNI(16)));
+  /* To get - 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 ... */
+  xb_vecNx16 vecSeqDiv4 = IVP_SRLINX16(IVP_SEQNX16(), 2);
+  /* Sequence - 0 1 2 3  d*P1 d*P1+1 d*P1+2 d*P1+3 2.d*P1 2.d*P1+1 2.d*P1+2 2.d*P1+3
+     3.d*P1 3.d*P1+1 3.d*P1+2 3.d*P1+3 ... */
+  IVP_MULANX16PACKL(vecGatherOff, vecSeqDiv4, dilationX * inDataPitch1);
+  vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride * inDataPitch1), \
+                             vecGatherOff, vecSelIdx);
+
+  /* Final Index Pattern is :
+   *
+   * First 16 elements :
+   *      0         1         2         3
+   * d*1*P1  d*1*P1+1  d*1*P1+2  d*1*P1+3
+   * d*2*P1  d*2*P1+1  d*2*P1+2  d*2*P1+3
+   * d*3*P1  d*3*P1+1  d*3*P1+2  d*3*P1+3
+   *
+   * Last 16 elements :
+   *       s*P1        s*P1+1        s*P1+2        s*P1+3
+   * (s+1*d)*P1  (s+1*d)*P1+1  (s+1*d)*P1+2  (s+1*d)*P1+3
+   * (s+2*d)*P1  (s+2*d)*P1+1  (s+2*d)*P1+2  (s+2*d)*P1+3
+   * (s+3*d)*P1  (s+3*d)*P1+1  (s+3*d)*P1+2  (s+3*d)*P1+3
+   *
+   */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+  xb_vec2Nx8* restrict pdvecCoeff3;
+  xb_vec2Nx8* restrict pdvecCoeff4;
+  xb_vec2Nx8* restrict pdvecOut;
+  int8_t*     restrict pData1;
+  int8_t*     restrict pData2;
+
+  int32_t remCh = numInCh & 3;
+
+  /* Generation of maskLut for handling cases when remCh is not equal to 0    */
+  /* e.g. if remCh is equal to 1 then sumMask is 0x000000FF,                  */
+  /*      if remCh is 2 then 0x0000FFFF, and if remCh is 3 then 0x00FFFFFF    */
+  const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 };
+
+  uint8_t remCh1 = XT_SALT(2, remCh + 1);
+  uint8_t remCh2 = XT_SALT(3, remCh + 1);
+
+  uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2;
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* Unrolled by 2 along both Output Width and Output Height.
+   * Also, unrolled along Input Channels by 4 and completely
+   * along the Kernel Width. Gathers are used for loading Input Data.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = (numOutCh - outCh);
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable used to handle the corner case of OutHeight being odd */
+      int32_t numY = XT_MIN(2, outH - y) - 1;
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        xb_vecNx16U vecGatherOff1;
+        xb_vecNx16U vecGatherOff2;
+
+        /* Variable used to handle the corner case of Output Width being odd */
+        int32_t numX = XT_MIN(2, outW - x) - 1;
+
+        /* Output, Input and Coefficient Data Pointers */
+        int8_t *pOut   = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int8_t *pData  = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Boolean vectors to handle the corner cases of Out Width and Height being odd */
+        vboolN vbX = IVP_LTRSN(16 * (numX + 1));
+        vboolN vbY = IVP_LTRSN(16 * (numX + 1) * numY);
+
+        for (ky = 0; ky < 4; ky++) /* Kernel Height Loop */
+        {
+          /* Pointer for Input Data Load */
+          pData1 = pData + ky * dilationY * inDataPitch2;
+          pData2 = pData1 + (stride * inDataPitch2 * numY);
+          /* Assign valid address for predicated false lines */
+          vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbX);
+          vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, vbY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3 + coeffPitch2);
+          pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3 + 2 * coeffPitch2);
+          pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3 + 3 * coeffPitch2);
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */
+          {
+            /* Gather Input Data */
+            xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff1);
+            xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2       = IVP_GATHERANX8S(pData2, vecGatherOff2);
+            xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2);
+
+
+            pData1 += 4;
+            pData2 += 4;
+
+            /* kx = 1 */
+            /* Extracting scalar integers for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 4);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff1, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 2 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         5);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff2, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 3 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         6);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         6);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff3, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff3, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff3, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff3, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 4 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         3);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         7);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         3);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         7);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff4, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff4, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff4, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff4, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+
+          /* Handling Corner cases of Number of Input Channels not being multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh  = numInCh - inCh;
+            vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh);
+
+            /* Gather Input Data */
+            xb_vec2Nx8 dvecData1 = 0;
+            xb_vec2Nx8 dvecData2 = 0;
+            /* Assign valid address for predicated false lines */
+            vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbX));
+            vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbY));
+
+            xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1);
+            dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff2);
+            dvecData2 = IVP_GATHERD2NX8_L(gather2);
+
+            /* kx = 1 */
+            /* Extracting scalar integers for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 4);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 4);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1;
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1 * remCh1);
+            xb_vec2Nx8 dvecCoeff2;
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1 * remCh2);
+            xb_vec2Nx8 dvecCoeff3;
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, 0);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 2 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         5);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, 0);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 3 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         6);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         6);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff3, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff3, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff3, 0);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 4 */
+            /* Extracting scalar integers for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         3);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         7);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         3);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         7);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff4, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff4, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff4, 0);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+          } /* End Input Channels Corner case Handling */
+        }   /* End Kernel Height Loop */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \
+                       numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 *
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 5x5 MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate 5x5 MOD_DWH 3D         */
+/*               dilated convolution function and 5x5 MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               stride equal to 1, 2 or 4                                  */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 5x5xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 5);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \
+                    XAI_ERR_BADARG, "dilation parameter has to be >= 1");
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_EDGE2(inTile, 2 + 2 * (XAI_CNN_CONV_GET_DILATIONX(param) - 1), 2 + 2 * (XAI_CNN_CONV_GET_DILATIONY(param) - 1));
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+
+  /* Calling further optimized function if dim1Size == dim1Pitch and dilation is 1 */
+  if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && \
+      (XAI_CNN_CONV_GET_DILATIONX(param) == 1) && (XAI_CNN_CONV_GET_DILATIONY(param) == 1))
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0)
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                                outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                              outTile, param);
+#endif
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                             coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                           coeffTile, biasArray, outTile, param);
+#endif
+    }
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    /* Max value of Gather Offset is ((stride*min(outW-1, 1) + 4 * dilationX) * inDataPitch1 +
+     * min(3, numInCh - 1)) */
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_PITCH(inTile) <                                                                     \
+                    ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) *                               \
+                                                             XT_MIN(1, outW - 1) + 4 * XAI_CNN_CONV_GET_DILATION(param))),  \
+                    XAI_ERR_BADARG, "dim1Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \
+                    XAI_TILE3D_GET_DIM1_PITCH(inTile),                                                                      \
+                    ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) *                               \
+                                                             XT_MIN(1, outW - 1) + 4 * XAI_CNN_CONV_GET_DILATION(param))));
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  int32_t dilatedKWidth  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1;
+
+  /* move to start of edge data only when input is already padded. */
+  pInData = &pInData[-(int32_t) ((dilatedKHeight / 2) * inDataPitch2 + (dilatedKWidth / 2) * inDataPitch1)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+
+
+  /* 4 Gathers are being used to Load Input Data. Many common elements
+   * will be loaded in separate Gathers, especially in the case of
+   * stride 1 and 2. To take the advantage of having common offsets in
+   * a single Gather, 2 Gather Patterns are generated as given below.
+   * For example, in the Gather Patterns generated below,
+   * if stride is 1 and dilation equal to 1, then 8 offsets are common and if stride is 2, 4 offsets
+   * are common in each Gather.
+   */
+  /* Gather Index Calculations */
+  xb_vecNx16 vecGather = IVP_ANDNX16(IVP_SEQNX16(), 3);
+  IVP_MULANX16PACKL(vecGather, inDataPitch1 * dilationX, IVP_SRLINX16(IVP_SEQNX16(), 2));
+  xb_vecNx16 vecGather1 = IVP_ADDNX16(vecGather, stride * inDataPitch1);
+
+  xb_vecNx16 vecSelIdx1 = IVP_SEQNX16();
+  IVP_ADDNX16T(vecSelIdx1, vecSelIdx1, 20, IVP_NOTBN(IVP_LTRNI(12)));
+  xb_vecNx16U vecGatherOff1 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx1);
+  xb_vecNx16 vecSelIdx2     = IVP_ADDNX16(IVP_SEQNX16(), 12);
+  IVP_ADDNX16T(vecSelIdx2, vecSelIdx2, 20, IVP_NOTBN(IVP_LTRNI(8)));
+  xb_vecNx16U vecGatherOff2 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx2);
+  /* Index Pattern of vecGatherOff1 is -
+   * 0 1 2 3 d*P1 d*P1+1 d*P1+2 d*P1+3 d*2*P1 d*2*P1+1 d*2*P1+2 d*2*P1+3
+   * s*P1 s*P1+1 s*P1+2 s*P1+3 (s+1*d)*P1 (s+1*d)*P1+1 (s+1*d)*P1+2 (s+1*d)*P1+3 */
+
+  /* Index Pattern of vecGatherOff2 is -
+   * d*3*P1 d*3*P1+1 d*3*P1+2 d*3*P1+3 d*4*P1 d*4*P1+1 d*4*P1+2 d*4*P1+3
+   * (s+2*d)*P1 (s+2*d)*P1+1 (s+2*d)*P1+2 (s+2*d)*P1+3
+   * (s+3*d)*P1 (s+3*d)*P1+1 (s+3*d)*P1+2 (s+3*d)*P1+3
+   * (s+4*d)*P1 (s+4*d)*P1+1 (s+4*d)*P1+2 (s+4*d)*P1+3 */
+
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecOut;
+  int8_t*     restrict pData1;
+  int8_t*     restrict pData2;
+
+  int32_t remCh = numInCh & 3;
+
+  /*Generation of sumMask from maskLut for cases when remInCh is not equal to 0 */
+  /*eg. if remInCh is equal to 1 then sumMask is 000000FF,                    */
+  /*    if remInCh is 2 then 0000FFFF and if remInCh is 3 then 00FFFFFF       */
+  const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 };
+
+  uint8_t remCh1 = XT_SALT(2, remCh + 1);
+  uint8_t remCh2 = XT_SALT(3, remCh + 1);
+
+  uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2;
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* 4 Gathers used for Input Data Load. Unrolled along */
+  /* Output Width and Height by 2. Also, unrolled along */
+  /* Input Channels by 4 and Kernel Width.              */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Out Channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable used for corner case handling of Out Height odd */
+      int32_t numY = XT_MIN(2, outH - y) - 1;
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        xb_vecNx16U vecGatherOff00;
+        xb_vecNx16U vecGatherOff01;
+        xb_vecNx16U vecGatherOff10;
+        xb_vecNx16U vecGatherOff11;
+        /* Variable used for corner case handling of Out Width odd */
+        int32_t numX = XT_MIN(2, outW - x) - 1;
+
+        /* Output, Input and Coefficient Data Pointers */
+        int8_t *pOut   = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int8_t *pData  = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Boolean Vectors for Predicate Gather with corner cases  */
+        /* handled for Out Width and Height being odd numbers      */
+        vboolN vb1 = IVP_ORBN(IVP_LTRNI(12), IVP_LTRSN(20 * numX));
+        vboolN vb2 = IVP_ORBN(IVP_LTRNI(8), IVP_LTRSN(20 * numX));
+        vboolN vb3 = IVP_ANDBN(vb1, IVP_LTRSN(20 * numY));
+        vboolN vb4 = IVP_ANDBN(vb2, IVP_LTRSN(20 * numY));
+
+        for (ky = 0; ky < 5; ky++) /* Kernel Height */
+        {
+          /* Pointer for Input Data Load */
+          pData1 = pData + ky * dilationY * inDataPitch2;
+          pData2 = pData1 + (stride * inDataPitch2 * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+          /* Assign valid address for predicated false lines */
+          vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, vb1);
+          vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, vb2);
+          vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, vb3);
+          vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, vb4);
+
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+          {
+            /* Gather Load of Input Data */
+            xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff00);
+            xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2       = IVP_GATHERANX8S(pData1, vecGatherOff01);
+            xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2);
+            xb_gsr gather3       = IVP_GATHERANX8S(pData2, vecGatherOff10);
+            xb_vec2Nx8 dvecData3 = IVP_GATHERD2NX8_L(gather3);
+            xb_gsr gather4       = IVP_GATHERANX8S(pData2, vecGatherOff11);
+            xb_vec2Nx8 dvecData4 = IVP_GATHERD2NX8_L(gather4);
+
+
+            pData1 += 4;
+            pData2 += 4;
+
+            /* kx = 1 */
+            /* Extracting scalars for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 3);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 3);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * \
+                                                 coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 2 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \
+                                         4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 3 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         2);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         2);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 4 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         0);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         3);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         0);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         3);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 5 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 4 * coeffPitch2);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+
+          /* Handling Corner cases of Number of Input Channels not being multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh  = numInCh - inCh;
+            vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh);
+            /* Assign valid address for predicated false lines */
+            vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb1));
+            vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb2));
+            vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb3));
+            vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb4));
+
+            /* Gather Input Data */
+            xb_vec2Nx8 dvecData1 = 0;
+            xb_vec2Nx8 dvecData2 = 0;
+            xb_vec2Nx8 dvecData3 = 0;
+            xb_vec2Nx8 dvecData4 = 0;
+            xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff00);
+            dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2 = IVP_GATHERANX8S(pData1, vecGatherOff01);
+            dvecData2 = IVP_GATHERD2NX8_L(gather2);
+            xb_gsr gather3 = IVP_GATHERANX8S(pData2, vecGatherOff10);
+            dvecData3 = IVP_GATHERD2NX8_L(gather3);
+            xb_gsr gather4 = IVP_GATHERANX8S(pData2, vecGatherOff11);
+            dvecData4 = IVP_GATHERD2NX8_L(gather4);
+
+
+            /* kx = 1 */
+            /* Extracting scalars for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 3);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 3);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1;
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            xb_vec2Nx8 dvecCoeff2;
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            xb_vec2Nx8 dvecCoeff3;
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 2 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \
+                                         4);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 3 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \
+                                         2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         2);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \
+                                         2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         2);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 4 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         0);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         3);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         0);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         3);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (remCh1 + remCh2));
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 5 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \
+                                         4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \
+                                         4);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+          } /* End Input Channels corner case handling */
+        }   /* End Kernel Height */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \
+                       numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for 7x7 MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers, the     */
+/*               code implementation is generated during preprocessing.     */
+/*               This method can be used to generate 7x7 MOD_DWH 3D         */
+/*               dilated convolution function and 7x7 MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               Stride values = 1, 2 and 4 are supported.                  */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and dilation = 1 for stride = 2, 4                         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XAI Error Code                                             */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is 7x7xDxN                                     */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 7);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 2) ||               \
+                    (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \
+                    "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \
+                    XAI_ERR_BADARG, "dilation parameter has to be >= 1");
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_EDGE(inTile, 3 + 3 * (XAI_CNN_CONV_GET_DILATION(param) - 1));
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* Calling further optimized function if dim1Size == dim1Pitch */
+  if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && \
+      (XAI_CNN_CONV_GET_DILATIONX(param) == 1) && (XAI_CNN_CONV_GET_DILATIONY(param) == 1))
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0)
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                                outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                              outTile, param);
+#endif
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                             coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                           coeffTile, biasArray, outTile, param);
+#endif
+    }
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    /* Max value of Gather Offset is ((stride*min(1,outW-1) + 6*dilationX) * inDataPitch1 + min(3,numInCh-1)) */
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_PITCH(inTile) <                                                                       \
+                    ((USHRT_MAX - XT_MIN(3, numInCh - 1)) /                                                                   \
+                     (XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(1, outW - 1) + 6 * XAI_CNN_CONV_GET_DILATION(param))),          \
+                    XAI_ERR_BADARG, "\ndim1Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \
+                    XAI_TILE3D_GET_DIM1_PITCH(inTile),                                                                        \
+                    ((USHRT_MAX - XT_MIN(3, numInCh - 1)) /                                                                   \
+                     (XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(1, outW - 1) + 6 * XAI_CNN_CONV_GET_DILATION(param))));
+  }
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t dilationX     = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Effective Kernel size = dilation * (KernelSize - 1) + 1             */
+  /* Effective kernel size is used for calculating the min required edge */
+  int32_t dilatedKWidth  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1;
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((dilatedKWidth / 2) * inDataPitch1 + (dilatedKHeight / 2) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+
+  /* 4 Gathers are being used to Load Input Data. Many common elements
+   * will be loaded in separate Gathers, especially in the case of
+   * stride 1 and 2. To take the advantage of having common offsets in
+   * a single Gather, 2 Gather Patterns are generated as given below.
+   * For example, in the Gather Patterns generated below,
+   * if stride is 1 and dilation = 1, then 12 offsets are common, and
+   * if stride is 2 and dilation = 1, then 8 offsets are common in each Gather.
+   */
+  /* Gather Index Calculations */
+  xb_vecNx16 vecGather = IVP_ANDNX16(IVP_SEQNX16(), 3);
+  IVP_MULANX16PACKL(vecGather, inDataPitch1 * dilationX, IVP_SRLINX16(IVP_SEQNX16(), 2));
+  xb_vecNx16 vecGather1 = IVP_ADDNX16(vecGather, stride * inDataPitch1);
+
+  xb_vecNx16 vecSelIdx1 = IVP_SEQNX16();
+  IVP_ADDNX16T(vecSelIdx1, vecSelIdx1, 16, IVP_NOTBN(IVP_LTRNI(16)));
+  xb_vecNx16U vecGatherOff1 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx1);
+  xb_vecNx16 vecSelIdx2     = IVP_ADDNX16(IVP_SEQNX16(), 16);
+  IVP_ADDNX16T(vecSelIdx2, vecSelIdx2, 16, IVP_NOTBN(IVP_LTRNI(12)));
+  xb_vecNx16U vecGatherOff2 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx2);
+  /* Index Pattern of vecGatherOff1 is -
+   * 0 1 2 3 P1*d P1*d+1 P1*d+2 P1*d+3 2*P1*d 2*P1*d+1 2*P1*d+2 2*P1*d+3 3*P1*d 3*P1*d+1 3*P1*d+2 3*P1*d+3
+   * s*P1 s*P1+1 s*P1+2 s*P1+3 (s+1*d)*P1 (s+1*d)*P1+1 (s+1*d)*P1+2 (s+1*d)*P1+3
+   * (s+2*d)*P1  (s+2*d)*P1+1  (s+2*d)*P1+2  (s+2*d)*P1+3 */
+
+  /* Index Pattern of vecGatherOff2 is -
+   * 4*P1*d 4*P1*d+1 4*P1*d+2 4*P1*d+3 5*P1*d 5*P1*d+1 5*P1*d+2 5*P1*d+3 6*P1*d 6*P1*d+1 6*P1*d+2 6*P1*d+3
+   * (s+3*d)*P1 (s+3*d)*P1+1 (s+3*d)*P1+2 (s+3*d)*P1+3 (s+4*d)*P1 (s+4*d)*P1+1 (s+4*d)*P1+2 (s+4*d)*P1+3
+   * (s+5*d)*P1 (s+5*d)*P1+1 (s+5*d)*P1+2 (s+5*d)*P1+3 (s+6*d)*P1 (s+6*d)*P1+1 (s+6*d)*P1+2 (s+6*d)*P1+3 */
+
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecOut;
+  int8_t*     restrict pData1;
+  int8_t*     restrict pData2;
+
+  int32_t remCh = numInCh & 3;
+
+  /* Generation of sumMask from maskLut for cases when remCh is not equal to 0 */
+  /* e.g. if remCh is equal to 1 then sumMask is 000000FF                      */
+  /*      if remCh is 2 then sumMask is 0000FFFF; if 3 then 00FFFFFF           */
+  const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 };
+
+  uint8_t remCh1 = XT_SALT(2, remCh + 1);
+  uint8_t remCh2 = XT_SALT(3, remCh + 1);
+
+  uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2;
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* 4 Gathers are used for Input Data Load corresponding to 4         */
+  /* Output Vectors. Loop unrolled along Output Width and Height by 2. */
+  /* Also unrolled along Input Channels by 4 and Kernel Width.         */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable used for corner case handling of Out Height odd */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        xb_vecNx16U vecGatherOff00;
+        xb_vecNx16U vecGatherOff01;
+        xb_vecNx16U vecGatherOff10;
+        xb_vecNx16U vecGatherOff11;
+        /* Variable used for corner case handling of Out Width odd */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output, Input and Coefficient Data Pointers */
+        int8_t *pOut   = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int8_t *pData  = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Boolean Vectors for Predicate Gather with corner cases  */
+        /* handled for Out Width and Height being odd numbers      */
+        vboolN vb1 = IVP_ORBN(IVP_LTRNI(16), IVP_LTRSN(28 * numX));
+        vboolN vb2 = IVP_ORBN(IVP_LTRNI(12), IVP_LTRSN(28 * numX));
+        vboolN vb3 = IVP_ANDBN(vb1, IVP_LTRSN(28 * numY));
+        vboolN vb4 = IVP_ANDBN(vb2, IVP_LTRSN(28 * numY));
+
+        for (ky = 0; ky < 7; ky++) /* Kernel Height */
+        {
+          /* Pointer for Input Data Load */
+          pData1 = pData + ky * inDataPitch2 * dilationY;
+          pData2 = pData1 + (stride * inDataPitch2 * numY);
+          /* Assign valid address for predicated false lines */
+          vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, vb1);
+          vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, vb2);
+          vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, vb3);
+          vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, vb4);
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Number of Input Channels */
+          {
+            /* Gathers for Input Loads */
+            xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff00);
+            xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2       = IVP_GATHERANX8S(pData1, vecGatherOff01);
+            xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2);
+            xb_gsr gather3       = IVP_GATHERANX8S(pData2, vecGatherOff10);
+            xb_vec2Nx8 dvecData3 = IVP_GATHERD2NX8_L(gather3);
+            xb_gsr gather4       = IVP_GATHERANX8S(pData2, vecGatherOff11);
+            xb_vec2Nx8 dvecData4 = IVP_GATHERD2NX8_L(gather4);
+
+            pData1 += 4;
+            pData2 += 4;
+
+            /* kx = 1 */
+            /* Extracting Scalars for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 4);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 2 */
+            /* Extracting Scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 5);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 3 */
+            /* Extracting Scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 6);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 6);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 4 */
+            /* Extracting Scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 3);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 3);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 3);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 3);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 5 */
+            /* Extracting Scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 4);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 6 */
+            /* Extracting Scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 5);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+
+            /* kx = 7 */
+            /* Extracting Scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 6);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 6);
+
+            /* 4 Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 6 * coeffPitch2);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+
+          /* Handling Corner cases of Number of Input Channels not being multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh  = numInCh - inCh;
+            vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh);
+            /* Assign valid address for predicated false lines */
+            vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb1));
+            vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb2));
+            vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb3));
+            vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb4));
+
+            /* Gather Input Data */
+            xb_vec2Nx8 dvecData1 = 0;
+            xb_vec2Nx8 dvecData2 = 0;
+            xb_vec2Nx8 dvecData3 = 0;
+            xb_vec2Nx8 dvecData4 = 0;
+            xb_gsr gather1       = IVP_GATHERANX8S(pData1, vecGatherOff00);
+            dvecData1 = IVP_GATHERD2NX8_L(gather1);
+            xb_gsr gather2 = IVP_GATHERANX8S(pData1, vecGatherOff01);
+            dvecData2 = IVP_GATHERD2NX8_L(gather2);
+            xb_gsr gather3 = IVP_GATHERANX8S(pData2, vecGatherOff10);
+            dvecData3 = IVP_GATHERD2NX8_L(gather3);
+            xb_gsr gather4 = IVP_GATHERANX8S(pData2, vecGatherOff11);
+            dvecData4 = IVP_GATHERD2NX8_L(gather4);
+
+
+            /* kx = 1 */
+            /* Extracting scalars for QMULs */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 4);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 4);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1;
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            xb_vec2Nx8 dvecCoeff2;
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            xb_vec2Nx8 dvecCoeff3;
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (remCh1 + remCh2));
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 2 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 5);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2)));
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 3 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 6);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 6);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (remCh1 + remCh2));
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 4 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 3);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 3);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 3);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 3);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh2 + remCh1)));
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 5 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 4);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 4);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (remCh1 + remCh2) * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 6 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 1);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 5);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 1);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 5);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (remCh1 + remCh2) * coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+
+            /* kx = 7 */
+            /* Extracting scalars for QMULs */
+            qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 2);
+            qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 6);
+            qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 2);
+            qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 6);
+
+            /* Aligned Vector Loads of coefficients */
+            IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1);
+            IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2);
+            IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask);
+          } /* End Input Channels corner case handling */
+        }   /* End Kernel Height */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for MxN MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN MOD_DWH 3D         */
+/*               dilated convolution function and MxN MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               Stride values = 1, 2 and 4 are supported                   */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and dilation = 1 for stride = 2, 4                         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XAI Error Code                                             */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 64.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \
+                    "\nKernel height = %d, width = %d\nKernel width and height should be less than or equal to 64",  \
+                    XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile));
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) &&                                      \
+                    ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG,                    \
+                    "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \
+                    XAI_ERR_BADARG, "dilation parameter has to be >= 1");
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* Calling further optimized function if dim1Size == dim1Pitch */
+  if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && \
+      (XAI_CNN_CONV_GET_DILATIONX(param) == 1) && (XAI_CNN_CONV_GET_DILATIONY(param) == 1))
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0)
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                                outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                              outTile, param);
+#endif
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                             coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                           coeffTile, biasArray, outTile, param);
+#endif
+    }
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU   = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU  = XAI_TILE4D_GET_DIM4(coeffTile);
+  int32_t dilatedkWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1;
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t leftEdge, topEdge;
+  if ((dilatedkWidthU % 2) != 0)
+  {
+    leftEdge = dilatedkWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1);
+  }
+
+  if ((dilatedkHeightU % 2) != 0)
+  {
+    topEdge = dilatedkHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable to handle corner case when height is odd */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* Variable to handle corner case when width is odd */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff, coeffAddrOff;
+
+        for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */
+        {
+          /* Condition checks performed to get the Input and Coefficient        */
+          /* Pointer Offsets after combining the Kernel Width and Height Loops  */
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + inAddrOff + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * numX + strideY * inDataPitch2 * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Primes registers for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+
+          /* Corner Case Handling if number of input channels not multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+            vaData1 = IVP_LA2NX8_PP(pdvecData1);
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Coefficient Loads */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Corner case handling */
+        }   /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : 3D convolution variant for dim1Size == dim1Pitch           */
+/*               (contiguous depth). kWidth and inCh loops are merged; use  */
+/*               only when kWidth * numInCh is a multiple of 4              */
+/****************************************************************************/
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Tile extents (DWH): dim1 = channels (depth), dim2 = width, dim3 = height */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size: coefficient tile (NDWH) dim3 = width, dim4 = height */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters (no dilation parameter is read in this variant) */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+  xb_vecNx16U* restrict pOutScaleData;
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 (dim2 pitch is not read) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t leftEdge, topEdge; /* odd kernel: k / 2; even kernel: edge flag picks k / 2 or k / 2 - 1 */
+
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Back the input pointer up by leftEdge columns and topEdge rows (start of edge data) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Output clamp limits: the ReLU range when enabled, else the range of the output tile type */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 only for S16 output */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+  int32_t numIter  = kWidthU * numInCh; /* trip count of the merged (kWidth * inCh) loop */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /*
+   * inCh and kWidth loops are combined into one loop of kWidthU * numInCh
+   * steps, consumed 4 at a time with no remainder handling. Assumed that the
+   * edges along the depth dimension of input and coefficient data are zero.
+   */
+
+  /* Loops Start: output rows -> output channel groups -> 4 output columns at a time */
+  for (y = 0; y < outH; y++)
+  {
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /* Load the per-output-channel scale values for this channel group */
+      pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+      for (x = 0; x < (outW - 3); x += 4) /* Image Width */
+      {                                   /* walk across the columns */
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values (one accumulator per output column) */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads: 4 adjacent output columns, strideX inputs apart */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideX * inDataPitch1);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + 3 * strideX * inDataPitch1);
+
+          /* Pointer for Coefficient Load (start of kernel row ky) */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels (4 bytes per column, pointers advance by 4) */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients (pointer steps by coeffPitch1 after each load) */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End (Input Channels * kWidth) */
+        } /* End Kernel Height */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store dvecOut1 along the output depth; the H store length is non-zero only when typeFlag is set */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth (next output column) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth (third output column) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth (fourth output column) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 3 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width (groups of 4 columns) */
+      if (x < outW) /* 1 to 3 leftover output columns */
+      {
+        int32_t enable2ndWidth = XT_SALT(1, outW - x); /* 1 if more than 1 column remains */
+        int32_t enable3rdWidth = XT_SALT(2, outW - x); /* 1 if more than 2 columns remain */
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values (daccSum4 is initialized but not used below) */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads; a disabled column reuses the first column's address */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1 * enable3rdWidth);
+
+          /* Pointer for Coefficient Load (start of kernel row ky) */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels (4 bytes per column, pointers advance by 4) */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+
+            /* Aligned Vector Loads of coefficients (pointer steps by coeffPitch1 after each load) */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          }   /* End (Input Channels * kWidth) */
+        } /* End Kernel Height */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth (first leftover column, always stored) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth (zero bytes requested when enable2ndWidth is 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth (zero bytes requested when enable3rdWidth is 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End leftover columns */
+    }     /* End Output Channels */
+  } /* End image height */
+}
+
+/****************************************************************************/
+/* Description : further optimized function if dim1Size == dim1Pitch        */
+/*               of 3D convolution                                          */
+/****************************************************************************/
+/*
+ * Overview (as implemented below):
+ *  - Kernel width and input channels are walked as one combined run of
+ *    numIter = kWidth * numInCh input bytes per kernel row, 4 bytes per
+ *    step, followed by an unconditional tail step for the remaining
+ *    (numIter - k) bytes.  The dispatcher in this file selects this variant
+ *    only when (numInCh * kWidth) % 4 != 0; the multiple-of-4 case goes to
+ *    the _x4 variant.
+ *  - Output is produced 4 columns at a time for a group of up to
+ *    2 * XCHAL_IVPN_SIMD_WIDTH output channels; a second code path handles
+ *    the last 1..3 columns of a row.
+ *  - Dilation parameters are not read here.
+ *  - No argument validation is performed and nothing is returned; results
+ *    are written through the data pointer of outTile.
+ * In the DILATED_VQ_CONV build the output scales are loaded from
+ * outputScaleArray starting at index outCh; otherwise a single output scale
+ * is read from param.
+ */
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  /* VQ build: output scales come from an array indexed by output channel */
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+  xb_vecNx16U* restrict pOutScaleData;
+#else
+  /* Non-VQ build: one output scale taken from the convolution params */
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  /* Only the dim1 and dim3 pitches are needed: dim2 (depth) and dim3       */
+  /* (width) are stepped together by coeffPitch1 in the combined loop.      */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Input bytes consumed per kernel row: kernel width and input channels   */
+  /* are treated as one combined dimension.                                 */
+  int32_t numIter = kWidthU * numInCh;
+
+  /* Edge sizes on the left/top of the input. Odd kernel sizes use size/2;  */
+  /* for even sizes the edge flag selects size/2 or size/2 - 1.             */
+  int32_t leftEdge, topEdge;
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    /* No ReLU: clamp to the full range of the output type                  */
+    /* (S16, S8, or otherwise 0..UCHAR_MAX).                                */
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  /* typeFlag is 1 only for S16 output; it scales the byte count of the     */
+  /* second (high-half) store to 0 for 8-bit outputs.                       */
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  /* Alignment register shared by all aligning stores; it is flushed with   */
+  /* IVP_SAPOS2NX8_FP after every output pixel.                             */
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   */
+
+  /* Loops Start */
+  for (y = 0; y < outH; y++)
+  {
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /*Load output scale values*/
+      pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+      /* Main path: four adjacent output columns (x .. x+3) per iteration */
+      for (x = 0; x < outW - 3; x += 4) /* Image Width */
+      {                                 /* walk across the columns */
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        /* pData: first input byte of the window for output (x, y);          */
+        /* pCoeff: first coefficient of the current output-channel group.    */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          /* One pointer per output column; neighbouring columns are        */
+          /* strideX * inDataPitch1 bytes apart in the input.               */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideX * inDataPitch1);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + 3 * strideX * inDataPitch1);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+          /* Each step consumes 4 input bytes per column and 4 coefficient  */
+          /* vectors spaced coeffPitch1 bytes apart.                        */
+          for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+
+            /* The same four coefficient vectors are accumulated into each  */
+            /* column's accumulator with that column's 4 packed input bytes.*/
+            /* NOTE(review): presumably byte i of the scalar pairs with     */
+            /* dvecCoeff(i+1) - confirm operand order in the ISA reference. */
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End Input Channels */
+
+          /* Corner case handling as numIter is not a multiple of 4 */
+          {
+            /* Bytes left in this kernel row after the 4-byte steps */
+            int32_t remInCh = numIter - k;
+
+            /* Aligning variable vector load of pixels */
+            /* Only remInCh bytes are requested; presumably the remaining   */
+            /* lanes read as zero - TODO confirm for IVP_LAV2NX8_XP.        */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligned Vector Loads of coefficients */
+            /* The post-increment is multiplied by the enable flag, so when */
+            /* fewer than 3 bytes remain the pointer stays put and the same */
+            /* coefficient vector is loaded again instead of stepping on.   */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+
+            /* Fourth coefficient operand is the constant 0: at most three  */
+            /* input bytes are left in the tail.                            */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Each column is written with two variable-length stores: the L    */
+        /* vector first, then the H vector, whose byte count is non-zero    */
+        /* only for S16 output (typeFlag) with more than                    */
+        /* XCHAL_IVPN_SIMD_WIDTH channels left. NOTE(review): a count <= 0  */
+        /* is expected to store nothing - confirm for IVP_SAV2NX8_XP.       */
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 3 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+      /* Tail path: 1 to 3 output columns remain in this row. Three columns */
+      /* are always computed; the enable flags collapse the input offsets   */
+      /* and store byte counts of columns that do not exist.                */
+      if (x < outW)
+      {
+        int32_t enable2ndWidth = XT_SALT(1, outW - x); /* 1 if more than 1 column remains */
+        int32_t enable3rdWidth = XT_SALT(2, outW - x); /* 1 if more than 2 columns remain */
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        /* daccSum4 is initialised by the shared macro but not used below */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          /* With an enable flag of 0 the column offset becomes 0, so the   */
+          /* pointer aliases column x instead of addressing a column past   */
+          /* the end of the row.                                            */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1 * enable3rdWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+
+          for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          }   /* End Input Channels */
+
+          /* Corner case handling as numIter is not a multiple of 4 */
+          {
+            /* Bytes left in this kernel row after the 4-byte steps */
+            int32_t remInCh = numIter - k;
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligned Vector Loads of coefficients */
+            /* Pointer advances only while further tail bytes exist */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+
+            /* Fourth coefficient operand is the constant 0 (tail <= 3 bytes) */
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          }   /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Column x always exists, so its store is unconditional. The byte  */
+        /* counts for columns x+1 and x+2 are multiplied by their enable    */
+        /* flags and become 0 when the column is absent.                    */
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      }
+    }     /* End Output Channels */
+  }
+}
+
+/***************xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH***********/
+/***************xaiConvolve3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH**************/
+/* Description : P6 optimized implementation for MxN MOD_DWH 3D convolution.*/
+/*               with loop across outTile as outermost loop. For H=1 , The  */
+/*               outermost loop will be executed only once                  */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 64.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \
+                    "\nKernel height = %d, width = %d\nKernel width and height should be less than or equal to 64",  \
+                    XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile));
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) &&                                      \
+                    ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG,                    \
+                    "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \
+                    XAI_ERR_BADARG, "dilation parameter has to be >= 1");
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Calling further optimized function if dim1Size == dim1Pitch */
+  if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && \
+      (XAI_CNN_CONV_GET_DILATIONX(param) == 1 && XAI_CNN_CONV_GET_DILATIONY(param) == 1))
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0)
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                                          outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \
+                                                                        outTile, param);
+#endif
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, \
+                                                                       coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, \
+                                                                     coeffTile, biasArray, outTile, param);
+#endif
+    }
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+  int32_t dilatedkWidth  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedkHeight = dilationY * (kHeightU - 1) + 1;
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+  xb_vecNx16U* restrict pOutScaleData;
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t leftEdge, topEdge;
+  if ((dilatedkWidth % 2) != 0)
+  {
+    leftEdge = dilatedkWidth / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedkWidth / 2) : ((dilatedkWidth / 2) - 1);
+  }
+
+  if ((dilatedkHeight % 2) != 0)
+  {
+    topEdge = dilatedkHeight / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedkHeight / 2) : ((dilatedkHeight / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, k, x, y;
+  int32_t inCh;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /* Loops Start */
+  for (y = 0; y < outH; y++)
+  {
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = numOutCh - outCh;
+
+#ifdef DILATED_VQ_CONV
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /*Load output scale values*/
+      pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+
+      for (x = 0; x < outW - 3; x += 4) /* Image Width */
+      {
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff              = 0, coeffAddrOff = 0;
+
+        for (k = 0; k < kWidthU * kHeightU; k++) /* Kernel Height * Kernel Width */
+        {
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * 2);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * 3);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Primes registers for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+            vaData1 = IVP_LA2NX8_PP(pdvecData1);
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Coefficient Loads */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }
+        }
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 3 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      }
+      if (x < outW)
+      {
+        int32_t enable2ndWidth = XT_SALT(1, outW - x);
+        int32_t enable3rdWidth = XT_SALT(2, outW - x);
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff              = 0, coeffAddrOff = 0;
+
+        for (k = 0; k < kWidthU * kHeightU; k++) /* Kernel Height * Kernel Width */
+        {
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * 2 * enable3rdWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Primes registers for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          } /* End Input Channels */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+            vaData1 = IVP_LA2NX8_PP(pdvecData1);
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Coefficient Loads */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          }
+        }
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for MxN MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN MOD_DWH 3D         */
+/*               dilated convolution function and MxN MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               Stride values = 1, 2 and 4 are supported                   */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and dilation = 1 for stride = 2, 4                         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : None (function returns void)                               */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 15.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               inChannels is a multiple of 2                              */
+/*               Active data pointer is aligned to 2-bytes                  */
+/****************************************************************************/
+#ifndef IVP_MULSUQA2N8XR8
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_U8S8IXCa2_depth2X_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_U8S8IXCa2_depth2X_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t numIter = kWidthU * numInCh;
+
+  int32_t leftEdge, topEdge;
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  uint16_t* restrict pData1;
+  uint16_t* restrict pData2;
+  uint16_t* restrict pData3;
+  uint16_t* restrict pData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  xb_vecNx16 vecData1, vecData2, vecData3, vecData4;
+  xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+  xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+  xb_vecNx16 vecTemp1, vecTemp2;
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable to handle corner case when height is odd */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* Variable to handle corner case when width is odd */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = ((int8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          pData1 = (uint16_t *) (pData + ky * inDataPitch2);
+          pData2 = (uint16_t *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX);
+          pData3 = (uint16_t *) (pData + ky * inDataPitch2 + strideY * inDataPitch2 * numY);
+          pData4 = (uint16_t *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX + strideY * inDataPitch2 * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 2) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Load 2 bytes of input data */
+            IVP_LSRNX16U_XP(vecData1, pData1, 2);
+            IVP_LSRNX16U_XP(vecData2, pData2, 2);
+            IVP_LSRNX16U_XP(vecData3, pData3, 2);
+            IVP_LSRNX16U_XP(vecData4, pData4, 2);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+
+            /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData1, vecData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData1 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData2 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData2, vecData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData3 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData4 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData3, vecData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData5 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData6 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData4, vecData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData7 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData8 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+
+            /* Multiply unsigned x signed and accumulate to 24-bits */
+            IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+          }   /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+#endif
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for MxN MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN MOD_DWH 3D         */
+/*               dilated convolution function and MxN MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               Implementation also supports dilation > 1 for stride = 1   */
+/*               and dilation = 1 for stride = 2, 4                         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : None (static helper returns void)                          */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 15.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/****************************************************************************/
+/* Although this routine supports IVP_MULSUQA2N8XR8, that path is intentionally disabled: on cores that provide
+   IVP_MULSUQA2N8XR8 we use convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4 and
+   convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth instead, which are faster than convolvedVQ3D_S_MxNdX_U8S8IXCa2_MOD_DWH. */
+#ifndef IVP_MULSUQA2N8XR8
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxNdX_U8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxNdX_U8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Tile dimensions (dim1 = channels, dim2 = width, dim3 = height) and the dilation factors */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Kernel size (NDWH coefficients: dim3 = width, dim4 = height) and its dilated extent */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+  int32_t dilatedkWidth  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedkHeight = dilationY * (kHeightU - 1) + 1;
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t leftEdge, topEdge; /* Edge sizes: half the dilated kernel if odd; if even, the edge flag selects size/2 or size/2 - 1 */
+  if ((dilatedkWidth % 2) != 0)
+  {
+    leftEdge = dilatedkWidth / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedkWidth / 2) : ((dilatedkWidth / 2) - 1);
+  }
+
+  if ((dilatedkHeight % 2) != 0)
+  {
+    topEdge = dilatedkHeight / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedkHeight / 2) : ((dilatedkHeight / 2) - 1);
+  }
+
+  /* Rewind the input pointer by leftEdge pixels and topEdge rows so that it addresses the first tap of the first kernel window */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Clamp limits: ReLU bounds from param when enabled, otherwise the full range of the output tile type (S16, S8, else U8) */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 for S16 output tiles, 0 otherwise */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Vector data pointers */
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+#ifndef IVP_MULSUQA2N8XR8
+  /* Vector data registers (this guard is always true here: the whole function already sits inside #ifndef IVP_MULSUQA2N8XR8) */
+  xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+  xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+  xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12;
+  xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16;
+  xb_vecNx16 vecData1, vecData2;
+  xb_vecNx16 vecData3, vecData4;
+  xb_vecNx16 vecData5, vecData6;
+  xb_vecNx16 vecData7, vecData8;
+  xb_vecNx16 vecTemp1, vecTemp2;
+
+  /* Custom select pattern for DSELs */
+  int16_t sel1       = (XCHAL_IVPN_SIMD_WIDTH << 8);
+  xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1);
+  int16_t sel2       = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1);
+  xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2);
+#endif
+
+  /* Loops Start: each iteration produces 2 * XCHAL_IVPN_SIMD_WIDTH output channels for a 2x2 patch of output pixels */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH */
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /* Load output scale values */
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* 1 when another output row exists below y, 0 on the last row of an odd-height tile */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* 1 when another output column exists right of x, 0 on the last column of an odd-width tile */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = ((uint8_t *) pInData + x * inDataPitch1 + y * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff, coeffAddrOff;
+
+        for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */
+        {
+          /* Condition checks performed to get the Input and Coefficient        */
+          /* Pointer Offsets after combining the Kernel Width and Height Loops  */
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Input pointers for the 2x2 output patch: (x, y), (x + numX, y), (x, y + numY), (x + numX, y + numY) */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + inAddrOff + inDataPitch1 * numX + inDataPitch2 * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Priming input loads */
+          valign vaIn1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaIn2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaIn3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaIn4 = IVP_LA2NX8U_PP(pdvecData4);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels, 4 per iteration */
+          {
+            xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4;
+            /* Aligning variable vector load of pixels */
+            IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, 4);
+            IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, 4);
+            IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, 4);
+            IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, 4);
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL (branch is never compiled, see the enclosing #ifndef) */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0);
+#else
+            /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+            IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+            IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+            IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+            IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+            /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+            /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+            dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+            /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+
+            /* Aligned vector loads of four coefficient vectors, coeffPitch1 bytes apart */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            /* Multiply unsigned x signed and accumulate to 24-bits */
+            IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4);
+#endif
+          }   /* End Input Channels */
+          /* Corner Case Handling if number of input channels not multiple of 4 (1 to 3 channels remain) */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+            vaIn1 = IVP_LA2NX8U_PP(pdvecData1); /* NOTE(review): only vaIn1 is re-primed here; vaIn2..vaIn4 keep their earlier state - confirm this is intended */
+            xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4;
+            /* Aligning variable vector load of pixels */
+            IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, remInCh);
+            IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, remInCh);
+            IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, remInCh);
+            IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, remInCh);
+
+            /* Gate the coefficient pointer increments so the loads do not step past the remInCh remaining coefficient vectors */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL (branch is never compiled, see the enclosing #ifndef) */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0);
+#else
+            /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+            IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+            IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+            IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+            IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+            /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+            /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+            dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+            /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+            /* Aligned vector loads of three coefficient vectors; the pointer step is 0 when the matching enable flag is 0 */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            /* Multiply unsigned x signed and accumulate to 24-bits; at most three channels remain, so only three coefficient vectors are used */
+            IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+            IVP_MULUSA2NX8(daccSum1, dvecData9, dvecCoeff3);
+            IVP_MULUSA2NX8(daccSum2, dvecData11, dvecCoeff3);
+            IVP_MULUSA2NX8(daccSum3, dvecData13, dvecCoeff3);
+            IVP_MULUSA2NX8(daccSum4, dvecData15, dvecCoeff3);
+#endif
+          }    /* End Corner case handling */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 (pixel x, y) along the output depth; the dvecOut1H byte count is 0 when typeFlag is 0 */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 (next column); offset and byte counts are multiplied by numX, so they are 0 when numX is 0 */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 (next row); offset and byte counts are multiplied by numY, so they are 0 when numY is 0 */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 (next column and next row); offset and byte counts are multiplied by numX * numY */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+#endif
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for MxN MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN MOD_DWH 3D         */
+/*               dilated convolution function and MxN MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               Stride values = 1, 2 and 4 are supported                   */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and dilation = 1 for stride = 2, 4                         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 64.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               No edges along dimension 1 of inTile                       */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Argument validation: tile types, DRAM placement, data order, kernel/stride/dilation/shift ranges */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \
+                    "\nKernel height = %d, width = %d\nKernel width and height should be less than or equal to 64",  \
+                    XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile));
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) &&                                      \
+                    ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG,                    \
+                    "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \
+                    XAI_ERR_BADARG, "dilation parameter has to be >= 1");
+    XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_PITCH(inTile) == XAI_TILE3D_GET_DIM1(inTile)) \
+                      && XAI_CNN_CONV_GET_DILATION(param) == 1) || XAI_CNN_CONV_GET_DILATION(param) > 1),
+                    XAI_ERR_BADARG, "Edges along input channels is not supported if dilation = 1.");
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) /* zero output scale: skip the convolution and fill outTile with a constant */
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+#ifdef IVP_MULSUQA2N8XR8 // only for Vision_130
+  if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && \
+      (XAI_CNN_CONV_GET_DILATIONX(param) == 1 && XAI_CNN_CONV_GET_DILATIONY(param) == 1))
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) /* (input channels * kernel width) is a multiple of 4 */
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, outTile, param);
+#endif
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth(inTile, coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth(inTile, coeffTile, biasArray, outTile, param);
+#endif
+    }
+    return(XAI_ERROR_STATUS());
+  }
+#else // Vision_P6
+  if (XAI_CNN_CONV_GET_DILATIONX(param) > 1 && XAI_CNN_CONV_GET_DILATIONY(param) > 1)
+  {
+#ifdef DILATED_VQ_CONV
+    convolvedVQ3D_S_MxNdX_U8S8IXCa2_MOD_DWH(inTile, \
+                                            coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+    convolved3D_S_MxNdX_U8S8IXCa2_MOD_DWH(inTile, \
+                                          coeffTile, biasArray, outTile, param);
+#endif
+    return(XAI_ERROR_STATUS());
+  }
+  if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1 && XAI_CNN_CONV_GET_DILATIONY(param) == 1) && \
+      (XAI_TILE3D_GET_DIM1(inTile) % 2) == 0                                                \
+      && ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(inTile)) & (2 - 1)) == 0))
+  {
+#ifdef DILATED_VQ_CONV
+    convolvedVQ3D_S_MxN_U8S8IXCa2_depth2X_MOD_DWH(inTile, \
+                                                  coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+    convolved3D_S_MxN_U8S8IXCa2_depth2X_MOD_DWH(inTile, \
+                                                coeffTile, biasArray, outTile, param);
+#endif
+    return(XAI_ERROR_STATUS());
+  }
+#endif
+
+  /* Generic path: reached when none of the specialized variants above was taken */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Kernel Size (NDWH) and its extent once dilation is applied */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+  int32_t dilatedkWidth  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedkHeight = dilationY * (kHeightU - 1) + 1;
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  //int32_t numIter = kWidthU * numInCh;
+
+  int32_t leftEdge, topEdge; /* odd dilated size: k / 2; even: k / 2 or k / 2 - 1, chosen by the edge flag */
+  if ((dilatedkWidth % 2) != 0)
+  {
+    leftEdge = dilatedkWidth / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedkWidth / 2) : ((dilatedkWidth / 2) - 1);
+  }
+
+  if ((dilatedkHeight % 2) != 0)
+  {
+    topEdge = dilatedkHeight / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedkHeight / 2) : ((dilatedkHeight / 2) - 1);
+  }
+
+
+  /* Move pointer back by leftEdge pixels and topEdge rows to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 for S16 output, 0 otherwise */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Vector data pointers */
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /* Vector data registers */
+  xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4;
+
+  valign vaIn1, vaIn2, vaIn3, vaIn4;
+
+  /* Loops Start: each iteration produces a 2x2 block of output pixels for up to 2 * XCHAL_IVPN_SIMD_WIDTH output channels */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */
+  {                                                                     /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = (numOutCh - outCh);
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* numY is 1 when output row y + 1 exists, 0 on the last row of an odd height */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* numX is 1 when output column x + 1 exists, 0 on the last column of an odd width */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff, coeffAddrOff;
+
+        for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */
+        {
+          /* Condition checks performed to get the Input and Coefficient        */
+          /* Pointer Offsets after combining the Kernel Width and Height Loops  */
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Pointers for Input Data Loads: one per pixel of the 2x2 output block (offsets collapse to 0 when numX / numY is 0) */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * numX + strideY * inDataPitch2 * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Priming input loads */
+          vaIn1 = IVP_LA2NX8U_PP(pdvecData1);
+          vaIn2 = IVP_LA2NX8U_PP(pdvecData2);
+          vaIn3 = IVP_LA2NX8U_PP(pdvecData3);
+          vaIn4 = IVP_LA2NX8U_PP(pdvecData4);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels, 4 per iteration */
+          {
+            /* Aligning variable vector load of pixels */
+
+
+            /* Load 4 bytes of input data */
+            IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, 4);
+            IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, 4);
+            IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, 4);
+            IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, 4);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0);
+#else
+            xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+            xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+            xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12;
+            xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16;
+            xb_vecNx16 vecData1, vecData2;
+            xb_vecNx16 vecData3, vecData4;
+            xb_vecNx16 vecData5, vecData6;
+            xb_vecNx16 vecData7, vecData8;
+            xb_vecNx16 vecTemp1, vecTemp2;
+
+            /* Custom select pattern for DSELs */
+            int16_t sel1       = ((XCHAL_IVPN_SIMD_WIDTH << 8));
+            xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1);
+            int16_t sel2       = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1);
+            xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2);
+
+            /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+            IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+            IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+            IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+            IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+            /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+            /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+            dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+            /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+            /* Aligned Vector Loads of coefficients: one vector per input channel, stepping by coeffPitch1 */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            /* Multiply unsigned x signed and accumulate to 24-bits */
+            IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4);
+#endif
+          } /* End Input Channels */
+
+          /* Corner Case Handling if number of input channels not multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+            vaIn1 = IVP_LA2NX8U_PP(pdvecData1); /* NOTE(review): only vaIn1 is re-primed here; vaIn2..vaIn4 reuse their earlier state - confirm intended */
+
+            /* Load the remaining remInCh (1 to 3) bytes of input data */
+            IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, remInCh);
+            IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, remInCh);
+            IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, remInCh);
+            IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, remInCh);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0);
+#else
+            xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+            xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+            xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12;
+            xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16;
+            xb_vecNx16 vecData1, vecData2;
+            xb_vecNx16 vecData3, vecData4;
+            xb_vecNx16 vecData5, vecData6;
+            xb_vecNx16 vecData7, vecData8;
+            xb_vecNx16 vecTemp1, vecTemp2;
+
+            /* Custom select pattern for DSELs */
+            int16_t sel1       = ((XCHAL_IVPN_SIMD_WIDTH << 8));
+            xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1);
+            int16_t sel2       = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1);
+            xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2);
+
+            /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+            IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+            IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+            IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+            IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+            /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+            /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+            dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+            /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Coefficient Loads: the pointer advances past a channel only when a further channel remains */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            /* Multiply unsigned x signed and accumulate to 24-bits; the fourth coefficient operand is 0 */
+            IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, 0);
+            IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, 0);
+            IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, 0);
+            IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, 0);
+#endif
+          }   /* End Input Channels corner case */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth; the H store length is 0 unless typeFlag is 1 */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth (store lengths are 0 when numX is 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth (store lengths are 0 when numY is 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth (store lengths are 0 when numX or numY is 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation for noUnrollH MxN MOD_DWH      */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN MOD_DWH 3D         */
+/*               dilated convolution function and MxN MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               Stride values = 1, 2 and 4 are supported                   */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and dilation = 1 for stride = 2, 4                         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : None (routine returns void)                                */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 15.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               No edges along dimension 1 of inTile                       */
+/****************************************************************************/
+/* Although this routine contains an IVP_MULSUQA2N8XR8 code path, it is intentionally compiled out on cores that support IVP_MULSUQA2N8XR8.
+   On those cores convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4 and convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth are used instead,
+   because they are faster than convolvedVQ3D_S_MxNdX_U8S8IXCa2_MOD_DWH. */
+#ifndef IVP_MULSUQA2N8XR8 /* routine is compiled only when IVP_MULSUQA2N8XR8 is not defined */
+#ifdef DILATED_VQ_CONV /* VQ build: signature takes an extra outputScaleArray argument */
+static _XAI_INLINE_ void convolvedVQ3D_S_MxNdX_U8S8IXCa2_noUnrollH_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxNdX_U8S8IXCa2_noUnrollH_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Tile dimensions: DIM1 gives the channel counts, DIM2/DIM3 of outTile bound the x/y output loops */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Kernel Size (NDWH): DIM3 = width, DIM4 = height; dilated extent = dilation * (k - 1) + 1 */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+  int32_t dilatedkWidth  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedkHeight = dilationY * (kHeightU - 1) + 1;
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV /* VQ build: 16-bit scales read from outputScaleArray, indexed by output channel below */
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+  xb_vecNx16U* restrict pOutScaleData;
+#else /* non-VQ build: a single output scale taken from param */
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t leftEdge, topEdge; /* edge sizes derived from the dilated kernel extent */
+  if ((dilatedkWidth % 2) != 0)
+  {
+    leftEdge = dilatedkWidth / 2;
+  }
+  else /* even extent: the edge flag selects k/2 or (k/2 - 1) */
+  {
+    leftEdge = leftEdgeFlag ? (dilatedkWidth / 2) : ((dilatedkWidth / 2) - 1);
+  }
+
+  if ((dilatedkHeight % 2) != 0)
+  {
+    topEdge = dilatedkHeight / 2;
+  }
+  else /* even extent: the edge flag selects k/2 or (k/2 - 1) */
+  {
+    topEdge = topEdgeFlag ? (dilatedkHeight / 2) : ((dilatedkHeight / 2) - 1);
+  }
+
+  /* Step back by leftEdge pixels and topEdge rows so pInData points at the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Clamp limits: the ReLU min/max from param when ReLU is enabled, otherwise the full range of the output tile type */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 only for S16 output */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); /* scales every output address/byte count below */
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Vector data pointers */
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+#ifndef IVP_MULSUQA2N8XR8 /* same condition as the guard that encloses the whole routine */
+  /* Vector data registers */
+  xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+  xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+  xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12;
+  xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16;
+  xb_vecNx16 vecData1, vecData2;
+  xb_vecNx16 vecData3, vecData4;
+  xb_vecNx16 vecData5, vecData6;
+  xb_vecNx16 vecData7, vecData8;
+  xb_vecNx16 vecTemp1, vecTemp2;
+
+  /* Custom select pattern for DSELs */
+  int16_t sel1       = (XCHAL_IVPN_SIMD_WIDTH << 8);
+  xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1);
+  int16_t sel2       = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1);
+  xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2);
+#endif
+
+  /* Loops Start: y (rows) -> outCh (blocks of 2 * XCHAL_IVPN_SIMD_WIDTH channels) -> x (4 columns at a time) */
+  for (y = 0; y < outH; y++) /* Image Height */
+  {
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /* Load output scale values for the channels starting at outCh */
+      pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+      for (x = 0; x < outW; x += 4) /* Image Width */
+      {                             /* walk across the columns */
+        /* Corner-case flags: 1 when the 2nd/3rd/4th output column exists (outW - x > 1/2/3), else 0 */
+        int32_t enable2ndWidth = XT_SALT(1, outW - x);
+        int32_t enable3rdWidth = XT_SALT(2, outW - x);
+        int32_t enable4thWidth = XT_SALT(3, outW - x);
+
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values starting at channel outCh */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers; the input offset uses x and y directly (no stride factor is applied) */
+        uint8_t *pData = ((uint8_t *) pInData + x * inDataPitch1 + y * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff, coeffAddrOff; /* scalar copies of lane 0 of the offset vectors above */
+
+        for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */
+        {
+          /* Condition checks performed to get the Input and Coefficient        */
+          /* Pointer Offsets after combining the Kernel Width and Height Loops  */
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Pointers for Input Data Loads; a disabled column (enable flag 0) aliases the first column's address */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + inDataPitch1 * 2 * enable3rdWidth);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + inAddrOff + inDataPitch1 * 3 * enable4thWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Priming input loads */
+          valign vaIn1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaIn2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaIn3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaIn4 = IVP_LA2NX8U_PP(pdvecData4);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+          {
+            xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4;
+            /* Aligning variable vector load of pixels: 4 bytes per column, matching inCh += 4 */
+            IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, 4);
+            IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, 4);
+            IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, 4);
+            IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, 4);
+#ifdef IVP_MULSUQA2N8XR8 /* this branch is excluded by the #ifndef guard around the routine */
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0);
+#else
+            /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+            IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+            IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+            IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+            IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+            /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+            /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+            dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+            /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+
+            /* Aligned Vector Loads of coefficients: four loads, each advancing the pointer by coeffPitch1 */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            /* Multiply unsigned x signed and accumulate to 24-bits */
+            IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4);
+#endif
+          }   /* End Input Channels */
+          /* Corner Case Handling if number of input channels not multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+            vaIn1 = IVP_LA2NX8U_PP(pdvecData1); /* NOTE(review): only vaIn1 is re-primed here; vaIn2..vaIn4 are reused as left by the code above - confirm this asymmetry is intended */
+            xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4;
+            /* Aligning variable vector load of pixels: only the remInCh remaining bytes per column are requested */
+            IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, remInCh);
+            IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, remInCh);
+            IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, remInCh);
+            IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, remInCh);
+
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0);
+#else
+            /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+            IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+            IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+            IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+            IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+            /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+            /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+            dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+            /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+            /* Aligned Vector Loads of coefficients; the pointer advances past a load only when enable2/enable3 is 1 */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            /* Multiply unsigned x signed and accumulate to 24-bits; the tail uses at most three coefficient vectors */
+            IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+            IVP_MULUSA2NX8(daccSum1, dvecData9, dvecCoeff3);
+            IVP_MULUSA2NX8(daccSum2, dvecData11, dvecCoeff3);
+            IVP_MULUSA2NX8(daccSum3, dvecData13, dvecCoeff3);
+            IVP_MULUSA2NX8(daccSum4, dvecData15, dvecCoeff3);
+#endif
+          }    /* End Corner case handling */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth; the H-half byte count is scaled by typeFlag (0 unless S16) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth; offset and byte counts are multiplied by enable2ndWidth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth; offset and byte counts are multiplied by enable3rdWidth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth; offset and byte counts are multiplied by enable4thWidth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * enable4thWidth);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End Output Channels */
+  }     /* End image height */
+}
+#endif /* !IVP_MULSUQA2N8XR8 */
+
+#ifndef IVP_MULSUQA2N8XR8
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_depth2X_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_U8S8IXCa2_noUnrollH_depth2X_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+  xb_vecNx16U* restrict pOutScaleData;
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t numIter = kWidthU * numInCh;
+
+  int32_t leftEdge, topEdge;
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  uint16_t* restrict pData1;
+  uint16_t* restrict pData2;
+  uint16_t* restrict pData3;
+  uint16_t* restrict pData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  xb_vecNx16 vecData1, vecData2, vecData3, vecData4;
+  xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+  xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+  xb_vecNx16 vecTemp1, vecTemp2;
+
+  /* Loops Start */
+  for (y = 0; y < outH; y++) /* Image Height */
+  {
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /*Load output scale values*/
+      pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+      for (x = 0; x < outW; x += 4) /* Image Width */
+      {                             /* walk across the columns */
+        /* Variables to handle corner cases */
+        int32_t enable2ndWidth = XT_SALT(1, outW - x);
+        int32_t enable3rdWidth = XT_SALT(2, outW - x);
+        int32_t enable4thWidth = XT_SALT(3, outW - x);
+
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = ((int8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          pData1 = (uint16_t *) (pData + ky * inDataPitch2);
+          pData2 = (uint16_t *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * enable2ndWidth);
+          pData3 = (uint16_t *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * 2 * enable3rdWidth);
+          pData4 = (uint16_t *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * 3 * enable4thWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 2) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Load 2 bytes of input data */
+            IVP_LSRNX16U_XP(vecData1, pData1, 2);
+            IVP_LSRNX16U_XP(vecData2, pData2, 2);
+            IVP_LSRNX16U_XP(vecData3, pData3, 2);
+            IVP_LSRNX16U_XP(vecData4, pData4, 2);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+
+            /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData1, vecData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData1 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData2 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData2, vecData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData3 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData4 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData3, vecData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData5 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData6 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData4, vecData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData7 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData8 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+
+            /* Multiply unsigned x signed and accumulate to 24-bits */
+            IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+          }   /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh * enable4thWidth);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    }   /* End Output Channels */
+  }     /* End image height */
+}
+#endif
+
+/****************************************************************************/
+/* Description : further optimized function if dim1Size == dim1Pitch        */
+/*               of 3D convolution for handling                             */
+/*               cases where kwidth * numInch is a multiple of 4.
+ *
+ * Each pass of the column loop produces four output columns for up to
+ * 2 * XCHAL_IVPN_SIMD_WIDTH output channels; one to three leftover columns
+ * are handled by a separate three-column tail path.
+ *
+ * The innermost loop walks kWidth * numInCh input bytes linearly, four
+ * bytes per iteration and with no remainder handling, which is why the
+ * depth must be contiguous and the product a multiple of 4.
+ *
+ * Parameters:
+ *   inTile           - input tile; data is read as unsigned 8-bit
+ *   coeffTile        - coefficient tile; data is read as signed 8-bit
+ *   biasArray        - bias values read as int32_t, indexed by output channel
+ *   outputScaleArray - (DILATED_VQ_CONV builds only) output scales read as
+ *                      uint16_t, indexed by output channel
+ *   outTile          - output tile; the element type selects clamp limits
+ *                      and store width
+ *   param            - accumulator shift, output shift, ReLU flag and
+ *                      limits, strides, edge flags and, in non-VQ builds,
+ *                      the single output scale
+ *
+ * The function is only compiled when IVP_MULSUQA2N8XR8 is defined.
+ */
+/****************************************************************************/
+#ifdef IVP_MULSUQA2N8XR8
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t leftEdge, topEdge;
+
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the data (including edge).
+   * The edges were computed above: for an odd kernel dimension the edge is
+   * half the dimension (rounded down); for an even one the LEFTEDGE/TOPEDGE
+   * flag selects between half the dimension and one less than that. */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType.
+   * With ReLU enabled the limits come from param; otherwise they are the
+   * full range of the output element type: S16, S8, or 0..UCHAR_MAX for
+   * any other type. */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 for S16 output, else 0 */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+  int32_t numIter  = kWidthU * numInCh; /* input bytes consumed per kernel row */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   *
+   * The combined loop steps by 4 with no tail, so numIter is expected
+   * to be a multiple of 4 (see the function description).
+   */
+
+  /* Loops Start */
+  for (y = 0; y < outH; y++) /* Image Height */
+  {
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /*Load output scale values*/
+      xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+
+      for (x = 0; x < (outW - 3); x += 4) /* Image Width */
+      {                                   /* walk across the columns */
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads: one per output column, each
+           * strideX * inDataPitch1 bytes after the previous one */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideX * inDataPitch1);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + 3 * strideX * inDataPitch1);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecData4; IVP_LAV2NX8U_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients: four vectors, with the
+             * pointer stepped by coeffPitch1 after each load. The same four
+             * vectors are shared by all four output columns below and are
+             * passed to the multiply in descending order (4, 3, 2, 1). */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth.
+         * The L vector is stored with a byte count of
+         * bytesPerPixel * remainingOutCh. The H vector's byte count is
+         * multiplied by typeFlag, so it is zero unless the output is S16.
+         * NOTE(review): IVP_SAPOS2NX8_FP presumably flushes the bytes still
+         * held in vaOutData -- confirm against the ISA reference. */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 3 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+      if (x < outW) /* Tail: one to three leftover output columns */
+      {
+        int32_t enable2ndWidth = XT_SALT(1, outW - x); /* gate for tail column 2; NOTE(review): presumably 1 when 1 < (outW - x), else 0 */
+        int32_t enable3rdWidth = XT_SALT(2, outW - x); /* gate for tail column 3; NOTE(review): presumably 1 when 2 < (outW - x), else 0 */
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values.
+         * daccSum4 is passed to ACC_INIT_BIAS but is neither accumulated
+         * into nor stored anywhere in this tail path. */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads.
+           * The column offsets are multiplied by the enable flags, so a
+           * disabled column falls back to the first column's address. */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1 * enable3rdWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          }   /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth.
+         * The byte counts are multiplied by enable2ndWidth, so a zero count
+         * is requested when this column does not exist; the store address
+         * itself is not gated. */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth.
+         * Gated by enable3rdWidth in the same way as the second column. */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      }
+    }     /* End Output Channels */
+  } /* End image height */
+}
+#endif
+
+/****************************************************************************/
+/* Description : further optimized function if dim1Size == dim1Pitch        */
+/*               of 3D convolution                                          */
+/****************************************************************************/
+#ifdef IVP_MULSUQA2N8XR8
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t numIter = kWidthU * numInCh;
+
+  int32_t leftEdge, topEdge;
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   */
+
+  /* Loops Start */
+  for (y = 0; y < outH; y++)
+  {
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /*Load output scale values*/
+      xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+      for (x = 0; x < outW - 3; x += 4) /* Image Width */
+      {                                 /* walk across the columns */
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideX * inDataPitch1);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + 3 * strideX * inDataPitch1);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+          for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecData4; IVP_LAV2NX8U_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End Input Channels */
+
+          /* Corner case handling as numIter is not a multiple of 4 */
+          {
+            int32_t remInCh = numIter - k;
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8U dvecData4; IVP_LAV2NX8U_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData4)), 0);
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End Input Channels */
+        } /* End Kernel Height */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 3 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+      if (x < outW)
+      {
+        int32_t enable2ndWidth = XT_SALT(1, outW - x);
+        int32_t enable3rdWidth = XT_SALT(2, outW - x);
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1 * enable3rdWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+
+          for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          }   /* End Input Channels */
+
+          /* Corner case handling as numIter is not a multiple of 4 */
+          {
+            int32_t remInCh = numIter - k;
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          }   /* End Input Channels */
+        } /* End Kernel Height */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      }
+    }     /* End Output Channels */
+  }
+}
+#endif
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \
+                    "\nKernel height = %d, width = %d\nKernel width and height should be less than or equal to 64",  \
+                    XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile));
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) &&                                      \
+                    ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG,                    \
+                    "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \
+                    XAI_ERR_BADARG, "dilation parameter has to be >= 1");
+    XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_PITCH(inTile) == XAI_TILE3D_GET_DIM1(inTile)) \
+                      && XAI_CNN_CONV_GET_DILATION(param) == 1) || XAI_CNN_CONV_GET_DILATION(param) > 1),
+                    XAI_ERR_BADARG, "Edges along input channels is not supported if dilation = 1.");
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+#ifndef DILATED_VQ_CONV
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+#ifdef IVP_MULSUQA2N8XR8 // only for Vision_130
+  if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && \
+      (XAI_CNN_CONV_GET_DILATIONX(param) == 1) && (XAI_CNN_CONV_GET_DILATIONY(param) == 1))
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0)
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, outTile, param);
+#endif
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV
+      convolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+      convolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, coeffTile, biasArray, outTile, param);
+#endif
+    }
+    return(XAI_ERROR_STATUS());
+  }
+#else // Vision_P6
+  if (XAI_CNN_CONV_GET_DILATIONX(param) > 1 && XAI_CNN_CONV_GET_DILATIONY(param) > 1)
+  {
+#ifdef DILATED_VQ_CONV
+    convolvedVQ3D_S_MxNdX_U8S8IXCa2_noUnrollH_MOD_DWH(inTile, \
+                                                      coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+    convolved3D_S_MxNdX_U8S8IXCa2_noUnrollH_MOD_DWH(inTile, \
+                                                    coeffTile, biasArray, outTile, param);
+#endif
+    return(XAI_ERROR_STATUS());
+  }
+  /* If number of input channels is a multiple of 2 &
+     the active data pointer is aligned to 2-bytes,
+     call a more optimal variant */
+  if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1 && XAI_CNN_CONV_GET_DILATIONY(param) == 1) && \
+      (XAI_TILE3D_GET_DIM1(inTile) % 2) == 0                                                \
+      && ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(inTile)) & (2 - 1)) == 0))
+  {
+#ifdef DILATED_VQ_CONV
+    convolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_depth2X_MOD_DWH(inTile, \
+                                                            coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+    convolved3D_S_MxN_U8S8IXCa2_noUnrollH_depth2X_MOD_DWH(inTile, \
+                                                          coeffTile, biasArray, outTile, param);
+#endif
+    return(XAI_ERROR_STATUS());
+  }
+#endif
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+  int32_t dilatedKWidth  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1;
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+  xb_vecNx16U* restrict pOutScaleData;
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t leftEdge, topEdge;
+  if ((dilatedKWidth % 2) != 0)
+  {
+    leftEdge = dilatedKWidth / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidth / 2) : ((dilatedKWidth / 2) - 1);
+  }
+
+  if ((dilatedKHeight % 2) != 0)
+  {
+    topEdge = dilatedKHeight / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeight / 2) : ((dilatedKHeight / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Vector data pointers */
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /* Loops Start */
+  for (y = 0; y < outH; y++)
+  {
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */
+    {                                                                     /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = (numOutCh - outCh);
+#ifdef DILATED_VQ_CONV
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /*Load output scale values*/
+      pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+
+      for (x = 0; x < outW - 3; x += 4) /* Image Width */
+      {
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff              = 0, coeffAddrOff = 0;
+
+        for (k = 0; k < kWidthU * kHeightU; k++) /* Kernel Height * Kernel Width */
+        {
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * 2);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * 3);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Primes registers for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecInData1; IVP_LAV2NX8U_XP(dvecInData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecInData2; IVP_LAV2NX8U_XP(dvecInData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecInData3; IVP_LAV2NX8U_XP(dvecInData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecInData4; IVP_LAV2NX8U_XP(dvecInData4, vaData4, pdvecData4, 4);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0);
+#else
+            xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+            xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+            xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12;
+            xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16;
+            xb_vecNx16 vecData1, vecData2;
+            xb_vecNx16 vecData3, vecData4;
+            xb_vecNx16 vecData5, vecData6;
+            xb_vecNx16 vecData7, vecData8;
+            xb_vecNx16 vecTemp1, vecTemp2;
+
+            /* Custom select pattern for DSELs */
+            int16_t sel1       = ((XCHAL_IVPN_SIMD_WIDTH << 8));
+            xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1);
+            int16_t sel2       = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1);
+            xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2);
+
+            /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+            IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+            IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+            IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+            IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+            /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+            /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+            dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+            /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            /* Multiply unsigned x signed and accumulate to 24-bits */
+            IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4);
+#endif
+          }   /* End Input Channels */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+            vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecInData1; IVP_LAV2NX8U_XP(dvecInData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8U dvecInData2; IVP_LAV2NX8U_XP(dvecInData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8U dvecInData3; IVP_LAV2NX8U_XP(dvecInData3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8U dvecInData4; IVP_LAV2NX8U_XP(dvecInData4, vaData4, pdvecData4, remInCh);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0);
+#else
+            xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+            xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+            xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12;
+            xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16;
+            xb_vecNx16 vecData1, vecData2;
+            xb_vecNx16 vecData3, vecData4;
+            xb_vecNx16 vecData5, vecData6;
+            xb_vecNx16 vecData7, vecData8;
+            xb_vecNx16 vecTemp1, vecTemp2;
+
+            /* Custom select pattern for DSELs */
+            int16_t sel1       = ((XCHAL_IVPN_SIMD_WIDTH << 8));
+            xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1);
+            int16_t sel2       = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1);
+            xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2);
+
+            /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+            IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+            IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+            IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+            IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+            /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+            /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+            dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+            /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Coefficient Loads */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            /* Multiply unsigned x signed and accumulate to 24-bits */
+            IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, 0);
+            IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, 0);
+            IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, 0);
+            IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, 0);
+#endif
+          }
+        }
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 3 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      }
+      if (x < outW)
+      {
+        int32_t enable2ndWidth = XT_SALT(1, outW - x);
+        int32_t enable3rdWidth = XT_SALT(2, outW - x);
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff              = 0, coeffAddrOff = 0;
+
+        for (k = 0; k < kWidthU * kHeightU; k++) /* Kernel Height * Kernel Width */
+        {
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * 2 * enable3rdWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Primes registers for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecInData1; IVP_LAV2NX8U_XP(dvecInData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecInData2; IVP_LAV2NX8U_XP(dvecInData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecInData3; IVP_LAV2NX8U_XP(dvecInData3, vaData3, pdvecData3, 4);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+
+#else
+            xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+            xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+            xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12;
+            xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16;
+            xb_vecNx16 vecData1, vecData2;
+            xb_vecNx16 vecData3, vecData4;
+            xb_vecNx16 vecData5, vecData6;
+            xb_vecNx16 vecData7, vecData8;
+            xb_vecNx16 vecTemp1, vecTemp2;
+            xb_vec2Nx8U dvecInData4 = 0;
+
+            /* Custom select pattern for DSELs */
+            int16_t sel1       = ((XCHAL_IVPN_SIMD_WIDTH << 8));
+            xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1);
+            int16_t sel2       = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1);
+            xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2);
+
+            /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+            IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+            IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+            IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+            IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+            /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+            /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+            dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+            /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+#else
+            /* Multiply unsigned x signed and accumulate to 24-bits */
+            IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4);
+#endif
+          } /* End Input Channels */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+            vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecInData1; IVP_LAV2NX8U_XP(dvecInData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8U dvecInData2; IVP_LAV2NX8U_XP(dvecInData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8U dvecInData3; IVP_LAV2NX8U_XP(dvecInData3, vaData3, pdvecData3, remInCh);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0);
+#else
+            xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4;
+            xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8;
+            xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12;
+            xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16;
+            xb_vecNx16 vecData1, vecData2;
+            xb_vecNx16 vecData3, vecData4;
+            xb_vecNx16 vecData5, vecData6;
+            xb_vecNx16 vecData7, vecData8;
+            xb_vecNx16 vecTemp1, vecTemp2;
+            xb_vec2Nx8U dvecInData4 = 0;
+
+            /* Custom select pattern for DSELs */
+            int16_t sel1       = ((XCHAL_IVPN_SIMD_WIDTH << 8));
+            xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1);
+            int16_t sel2       = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1);
+            xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2);
+
+            /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */
+            IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1);
+            IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1);
+            IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2);
+            IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2);
+
+            /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */
+            /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */
+            dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+            dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+            dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
+
+            /* De-interleave a b a b a b... and move to a a a a... and b b b b... */
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+            IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1);
+            dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2);
+#endif
+
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Coefficient Loads */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+#else
+            /* Multiply unsigned x signed and accumulate to 24-bits */
+            IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, 0);
+            IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, 0);
+            IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, 0);
+            IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, 0);
+#endif
+          }
+        }
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************* end of VQ MOD variants ***************************************/
+/**********************************************************************************************/
+#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.c
new file mode 100644
index 00000000000..ce103bda723
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.c
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#undef DILATED_VQ_CONV_S16
+
+#include "cnn_dilated_conv_MOD_S16.h"
+
+/******************************* end of MOD variants ***************************************/
+/*******************************************************************************************/
+#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.h
new file mode 100644
index 00000000000..8d2c920eb12
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.h
@@ -0,0 +1,708 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+/******************************************************************************************
+* MOD DWH variants
+******************************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized implementation for MxN MOD_DWH                */
+/*               3D convolution for S16 for handling cases where            */
+/*               kwidth * numInch is not a multiple of 4                    */
+/*               Code implementation is generated during preprocessing stage*/
+/*               This method can be used to generate MxN MOD_DWH 3D         */
+/*               dilated convolution function and MxN MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               Stride values = 1, 2 and 4 are supported for dilation = 1  */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : None (function returns void)                               */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is S16, CoeffData is S16                            */
+/*               biasArray is signed 64b, value not exceeding signed 48b    */
+/*               Output scale array is U16                                  */
+/*               OutData is U16/S16                                         */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 15.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_S16
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(const xai_pTile3D inTile,
+                                                                                const xai_pTile4D coeffTile,
+                                                                                const xai_pArray biasArray,
+                                                                                const xai_pArray outputScaleArray,
+                                                                                xai_pTile3D outTile,
+                                                                                const xai_cnn_conv_params *param)
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(const xai_pTile3D inTile,
+                                                                              const xai_pTile4D coeffTile,
+                                                                              const xai_pArray biasArray,
+                                                                              xai_pTile3D outTile,
+                                                                              const xai_cnn_conv_params *param)
+#endif
+{
+  /* Output width/height (dim2/dim3) and input/output channel counts (dim1) read from the DWH tiles */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel width and height: dim3 and dim4 of the NDWH coefficient tile */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters: shifts, ReLU flag, strides and edge flags */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int16_t *pInData     = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int16_t *pOutData    = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int16_t *pCoeffData  = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int64_t *pBiasData64 = (int64_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+#ifdef DILATED_VQ_CONV_S16
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 (the dim2 pitch is not used in this function) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  int32_t leftEdge, topEdge; /* number of columns / rows the input pointer is moved back by */
+  int32_t minLim, maxLim;    /* clamp limits applied to the output values */
+
+  if ((kWidthU % 2) != 0) /* odd kernel width */
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else /* even kernel width: leftEdgeFlag selects kWidthU / 2 or (kWidthU / 2) - 1 */
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0) /* odd kernel height */
+  {
+    topEdge = kHeightU / 2;
+  }
+  else /* even kernel height: topEdgeFlag selects kHeightU / 2 or (kHeightU / 2) - 1 */
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+
+  /* Move the input pointer back by leftEdge columns and topEdge rows (start of the data including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Output clamp limits: ReLU min/max from param if ReLU is enabled, else the 16-bit range of the out tile type */
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0;
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX : USHRT_MAX;
+  }
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  int32_t numIter = kWidthU * numInCh; /* entries in one kernel row: kernel width and input channels combined */
+
+  xb_vec2Nx8 *restrict pdvecBias = (xb_vec2Nx8 *) (pBiasData64);
+  xb_vecN_2x32v* restrict phvecIn1;
+  xb_vecN_2x32v* restrict phvecIn2;
+  xb_vecN_2x32v* restrict phvecIn3;
+  xb_vecN_2x32v* restrict phvecIn4;
+
+  xb_vecNx16* restrict pvecCoeff;
+  xb_vecNx16* restrict pvecOut;
+
+  valign vaOutData = IVP_ZALIGN(), vaBias = IVP_LA2NX8_PP(pdvecBias);
+
+  /*
+   * inCh and kWidth loops are combined. It is assumed that the
+   * edges along the depth dimension of the input data are zero and that
+   * the edges along the depth dimension of the coefficient data are zero.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of XCHAL_IVPN_SIMD_WIDTH */
+    xb_vecNx48 accBias48;
+    int32_t remainingOutCh = numOutCh - outCh; /* output channels left from outCh onward (not clamped here) */
+    ACC_INIT_BIAS64_MOD_ONEACC(pdvecBias, vaBias, remainingOutCh, accBias48);
+#ifdef DILATED_VQ_CONV_S16
+    xb_vecNx16U vecScaleData;
+    /* Load the output scale values for this group of output channels */
+    valign vaScale = IVP_LANX16U_PP(pOutScaleData);
+    IVP_LAVNX16U_XP(vecScaleData, vaScale, pOutScaleData, 2 * remainingOutCh);
+#endif
+
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* numY is 0 on the last row of an odd-height output (no second row to compute), otherwise 1 */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* numX is 0 on the last column of an odd-width output (no second column to compute), otherwise 1 */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output pointer for the top-left pixel (x, y) of the current 2x2 output block */
+        int16_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2);
+
+        /* One accumulator per pixel of the 2x2 output block, each initialized with the bias values */
+        xb_vecNx48 accSum1, accSum2, accSum3, accSum4;
+        accSum4 = accSum3 = accSum2 = accSum1 = accBias48;
+
+        /* Input pointer for output pixel (x, y) with strides applied; coefficient pointer for channel outCh */
+        int16_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int16_t *pCoeff = pCoeffData + outCh;
+
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Input row pointers for the four output pixels; offsets become 0 when numX and/or numY is 0 */
+          phvecIn1 = (xb_vecN_2x32v *) (pData + ky * inDataPitch2);
+          phvecIn2 = (xb_vecN_2x32v *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX);
+          phvecIn3 = (xb_vecN_2x32v *) (pData + ky * inDataPitch2 + strideY * inDataPitch2 * numY);
+          phvecIn4 = (xb_vecN_2x32v *) (pData + ky * inDataPitch2 + (strideX * \
+                                                                     inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LAN_2X32_PP(phvecIn1);
+          valign vaData2 = IVP_LAN_2X32_PP(phvecIn2);
+          valign vaData3 = IVP_LAN_2X32_PP(phvecIn3);
+          valign vaData4 = IVP_LAN_2X32_PP(phvecIn4);
+          /* Pointer for Coefficient Load (kernel row ky) */
+          pvecCoeff = (xb_vecNx16 *) (pCoeff + ky * coeffPitch3);
+
+          for (k = 0; k < numIter - 3; k += 4) /* 4 entries of the combined (Input Channels * kWidth) run per iteration */
+          {
+            /* Aligning variable vector load of pixels: 8 bytes (4 int16 values) for each of the 4 output pixels */
+            xb_vecN_2x32v hvecData1; IVP_LAVN_2X32_XP(hvecData1, vaData1, phvecIn1, 8);
+            xb_vecN_2x32v hvecData2; IVP_LAVN_2X32_XP(hvecData2, vaData2, phvecIn2, 8);
+            xb_vecN_2x32v hvecData3; IVP_LAVN_2X32_XP(hvecData3, vaData3, phvecIn3, 8);
+            xb_vecN_2x32v hvecData4; IVP_LAVN_2X32_XP(hvecData4, vaData4, phvecIn4, 8);
+
+            /* Vector loads of coefficients; the pointer steps by 2 * coeffPitch1 (one dim1 pitch of int16) per load */
+            xb_vecNx16 vecCoeff1; IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, 2 * coeffPitch1);
+            xb_vecNx16 vecCoeff2; IVP_L2UNX16_XP(vecCoeff2, pvecCoeff, 2 * coeffPitch1);
+            xb_vecNx16 vecCoeff3; IVP_L2UNX16_XP(vecCoeff3, pvecCoeff, 2 * coeffPitch1);
+            xb_vecNx16 vecCoeff4; IVP_L2UNX16_XP(vecCoeff4, pvecCoeff, 2 * coeffPitch1);
+
+            IVP_MULPAN16XR16(accSum1, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData1, 0));
+            IVP_MULPAN16XR16(accSum2, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData2, 0));
+            IVP_MULPAN16XR16(accSum3, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData3, 0));
+            IVP_MULPAN16XR16(accSum4, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData4, 0));
+
+            IVP_MULPAN16XR16(accSum1, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData1, 1));
+            IVP_MULPAN16XR16(accSum2, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData2, 1));
+            IVP_MULPAN16XR16(accSum3, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData3, 1));
+            IVP_MULPAN16XR16(accSum4, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData4, 1));
+          }   /* End Input Channels */
+          if (k < numIter) /* 1 to 3 leftover entries when numIter is not a multiple of 4 */
+          {
+            int32_t remInCh = numIter - k;
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligning variable vector load of pixels: only the 2 * remInCh leftover bytes are requested */
+            xb_vecN_2x32v hvecData1; IVP_LAVN_2X32_XP(hvecData1, vaData1, phvecIn1, 2 * remInCh);
+            xb_vecN_2x32v hvecData2; IVP_LAVN_2X32_XP(hvecData2, vaData2, phvecIn2, 2 * remInCh);
+            xb_vecN_2x32v hvecData3; IVP_LAVN_2X32_XP(hvecData3, vaData3, phvecIn3, 2 * remInCh);
+            xb_vecN_2x32v hvecData4; IVP_LAVN_2X32_XP(hvecData4, vaData4, phvecIn4, 2 * remInCh);
+
+            /* Coefficient loads: the pointer steps after the 1st/2nd load only if a 2nd/3rd entry exists */
+            xb_vecNx16 vecCoeff1; IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, 2 * coeffPitch1 * enable2);
+            xb_vecNx16 vecCoeff2; IVP_L2UNX16_XP(vecCoeff2, pvecCoeff, 2 * coeffPitch1 * enable3);
+            xb_vecNx16 vecCoeff3; IVP_L2UNX16_XP(vecCoeff3, pvecCoeff, 2 * coeffPitch1);
+
+            IVP_MULPAN16XR16(accSum1, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData1, 0));
+            IVP_MULPAN16XR16(accSum2, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData2, 0));
+            IVP_MULPAN16XR16(accSum3, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData3, 0));
+            IVP_MULPAN16XR16(accSum4, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData4, 0));
+
+            IVP_MULPAN16XR16(accSum1, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData1, 1));
+            IVP_MULPAN16XR16(accSum2, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData2, 1));
+            IVP_MULPAN16XR16(accSum3, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData3, 1));
+            IVP_MULPAN16XR16(accSum4, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData4, 1));
+          }
+        } /* End Kernel Height */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vecNx16 vecOut1, vecOut2, vecOut3, vecOut4;
+#ifdef DILATED_VQ_CONV_S16
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut1, accSum1, packShiftAccU, \
+                                             vecScaleData, outShiftU, minLim, maxLim);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut2, accSum2, packShiftAccU, \
+                                             vecScaleData, outShiftU, minLim, maxLim);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut3, accSum3, packShiftAccU, \
+                                             vecScaleData, outShiftU, minLim, maxLim);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut4, accSum4, packShiftAccU, \
+                                             vecScaleData, outShiftU, minLim, maxLim);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1, accSum1, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2, accSum2, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3, accSum3, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4, accSum4, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim);
+#endif
+        /* Store vecOut1 (output pixel x, y) along the output depth */
+        pvecOut = (xb_vecNx16 *) (pOut + outCh);
+        IVP_SAVNX16_XP(vecOut1, vaOutData, pvecOut, 2 * remainingOutCh);
+        IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+        /* Store vecOut2 (output pixel x + 1, y) along the output depth; byte count is 0 when numX is 0 */
+        pvecOut = (xb_vecNx16 *) (pOut + (outCh + outDataPitch1) * numX);
+        IVP_SAVNX16_XP(vecOut2, vaOutData, pvecOut, 2 * remainingOutCh * numX);
+        IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+        /* Store vecOut3 (output pixel x, y + 1) along the output depth; byte count is 0 when numY is 0 */
+        pvecOut = (xb_vecNx16 *) (pOut + (outCh + outDataPitch2) * numY);
+        IVP_SAVNX16_XP(vecOut3, vaOutData, pvecOut, 2 * remainingOutCh * numY);
+        IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+        /* Store vecOut4 (output pixel x + 1, y + 1); byte count is 0 when numX or numY is 0 */
+        pvecOut = (xb_vecNx16 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY));
+        IVP_SAVNX16_XP(vecOut4, vaOutData, pvecOut, 2 * remainingOutCh * numX * numY);
+        IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for MxN MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN MOD_DWH 3D         */
+/*               dilated convolution function and MxN MOD_DWH 3D VQ         */
+/*               dilated convolution function                               */
+/*               Stride values = 1, 2 and 4 are supported                   */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and dilation = 1 for stride = 2, 4                         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is S16, CoeffData is S16                            */
+/*               biasArray is signed 64b, value not exceeding signed 48b    */
+/*               Output scale array is U16                                  */
+/*               OutData is U16/S16                                         */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 15.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               No edges along dimension 1 of inTile                       */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_S16
+XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                      const xai_pTile4D coeffTile,
+                                                      const xai_pArray biasArray,
+                                                      const xai_pArray outputScaleArray,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_conv_params *param)
+#else
+XAI_ERR_TYPE xaiConvolved3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                    const xai_pTile4D coeffTile,
+                                                    const xai_pArray biasArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_conv_params *param)
+#endif
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S16(inTile);
+    XAI_CHECK_TILE3D_I16(outTile);
+    XAI_CHECK_TILE4D_S16(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S64(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE,       \
+                    "\nKernel Width = %d, Kernel Height = %d\nKernel Width and Height should be less than or equal to 64", \
+                    XAI_TILE4D_GET_DIM3(coeffTile), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) &&                                      \
+                    ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG,                    \
+                    "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1) ||                                                          \
+                    ((XAI_CNN_CONV_GET_DILATION(param) >= 1) &&                                                         \
+                     (XAI_CNN_CONV_GET_STRIDEX(param) == 1) && (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG, \
+                    "\nDilation = %hhu\nDilation should be 1. It can be greater than 1 only when stride is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and Dilation along height = %hhu\n \
+                     Dilation along width should be equal to dilation along height.",
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 32,                                       \
+                    XAI_ERR_NORM, "\nAccumulator shift value = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                           \
+                    XAI_ERR_NORM, "\nOutput shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+
+#ifdef DILATED_VQ_CONV_S16
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+  }
+
+#ifndef DILATED_VQ_CONV_S16
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* Calling further optimized function if dim1Size == dim1Pitch */
+  if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && XAI_CNN_CONV_GET_DILATION(param) == 1)
+  {
+#ifdef DILATED_VQ_CONV_S16
+    convolvedVQ3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(inTile, \
+                                                           coeffTile, biasArray, outputScaleArray, outTile, param);
+#else
+    convolved3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(inTile, \
+                                                         coeffTile, biasArray, outTile, param);
+#endif
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU   = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU  = XAI_TILE4D_GET_DIM4(coeffTile);
+  const int32_t dilationU = XAI_CNN_CONV_GET_DILATION(param);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int16_t *pInData     = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int16_t *pOutData    = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int16_t *pCoeffData  = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int64_t *pBiasData64 = (int64_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV_S16
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t dilatedKWidthU  = dilationU * (kWidthU - 1) + 1;
+  int32_t dilatedKHeightU = dilationU * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge;
+  int32_t minLim, maxLim;
+
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0;
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX : USHRT_MAX;
+  }
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, k, inCh;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Vector data pointers */
+  xb_vec2Nx8 *restrict pdvecBias = (xb_vec2Nx8 *) (pBiasData64);
+  xb_vecN_2x32v* restrict phvecIn1;
+  xb_vecN_2x32v* restrict phvecIn2;
+  xb_vecN_2x32v* restrict phvecIn3;
+  xb_vecN_2x32v* restrict phvecIn4;
+  xb_vecNx16* restrict pvecCoeff;
+  xb_vecNx16* restrict pvecOut;
+
+  valign vaBias = IVP_LA2NX8_PP(pdvecBias);
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  XCHAL_IVPN_SIMD_WIDTH*/
+    xb_vecNx48 accBias48;
+    int32_t remainingOutCh = numOutCh - outCh;
+    ACC_INIT_BIAS64_MOD_ONEACC(pdvecBias, vaBias, remainingOutCh, accBias48);
+#ifdef DILATED_VQ_CONV_S16
+    xb_vecNx16U vecScaleData;
+    /*Load output scale values*/
+    valign vaScale = IVP_LANX16U_PP(pOutScaleData);
+    IVP_LAVNX16U_XP(vecScaleData, vaScale, pOutScaleData, 2 * remainingOutCh);
+#endif
+
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable to handle corner case when height is odd */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* Variable to handle corner case when width is odd */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int16_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2);
+
+        /* Initialize accumulators with bias values */
+        xb_vecNx48 accSum1, accSum2, accSum3, accSum4;
+        accSum4 = accSum3 = accSum2 = accSum1 = accBias48;
+
+        /* Input Data and Coeff Data Pointers */
+        int16_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int16_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff, coeffAddrOff;
+
+        for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */
+        {
+          /* Condition checks performed to get the Input and Coefficient        */
+          /* Pointer Offsets after combining the Kernel Width and Height Loops  */
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationU - kWidthU * inDataPitch1 * dilationU, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationU);
+
+          /* Pointers for Input Data Loads */
+          phvecIn1 = (xb_vecN_2x32v *) (pData + inAddrOff);
+          phvecIn2 = (xb_vecN_2x32v *) (pData + inAddrOff + strideX * inDataPitch1 * numX);
+          phvecIn3 = (xb_vecN_2x32v *) (pData + inAddrOff + strideY * inDataPitch2 * numY);
+          phvecIn4 = (xb_vecN_2x32v *) (pData + inAddrOff + (strideX * \
+                                                             inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LAN_2X32_PP(phvecIn1);
+          valign vaData2 = IVP_LAN_2X32_PP(phvecIn2);
+          valign vaData3 = IVP_LAN_2X32_PP(phvecIn3);
+          valign vaData4 = IVP_LAN_2X32_PP(phvecIn4);
+
+          /* Pointer for Coefficient Load */
+          pvecCoeff = (xb_vecNx16 *) (pCoeff + coeffAddrOff);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vecN_2x32v hvecData1; IVP_LAVN_2X32_XP(hvecData1, vaData1, phvecIn1, 8);
+            xb_vecN_2x32v hvecData2; IVP_LAVN_2X32_XP(hvecData2, vaData2, phvecIn2, 8);
+            xb_vecN_2x32v hvecData3; IVP_LAVN_2X32_XP(hvecData3, vaData3, phvecIn3, 8);
+            xb_vecN_2x32v hvecData4; IVP_LAVN_2X32_XP(hvecData4, vaData4, phvecIn4, 8);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vecNx16 vecCoeff1; IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, 2 * coeffPitch1);
+            xb_vecNx16 vecCoeff2; IVP_L2UNX16_XP(vecCoeff2, pvecCoeff, 2 * coeffPitch1);
+            xb_vecNx16 vecCoeff3; IVP_L2UNX16_XP(vecCoeff3, pvecCoeff, 2 * coeffPitch1);
+            xb_vecNx16 vecCoeff4; IVP_L2UNX16_XP(vecCoeff4, pvecCoeff, 2 * coeffPitch1);
+
+            IVP_MULPAN16XR16(accSum1, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData1, 0));
+            IVP_MULPAN16XR16(accSum2, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData2, 0));
+            IVP_MULPAN16XR16(accSum3, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData3, 0));
+            IVP_MULPAN16XR16(accSum4, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData4, 0));
+
+            IVP_MULPAN16XR16(accSum1, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData1, 1));
+            IVP_MULPAN16XR16(accSum2, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData2, 1));
+            IVP_MULPAN16XR16(accSum3, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData3, 1));
+            IVP_MULPAN16XR16(accSum4, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData4, 1));
+          }   /* End Input Channels */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligning variable vector load of pixels */
+            xb_vecN_2x32v hvecData1; IVP_LAVN_2X32_XP(hvecData1, vaData1, phvecIn1, 2 * remInCh);
+            xb_vecN_2x32v hvecData2; IVP_LAVN_2X32_XP(hvecData2, vaData2, phvecIn2, 2 * remInCh);
+            xb_vecN_2x32v hvecData3; IVP_LAVN_2X32_XP(hvecData3, vaData3, phvecIn3, 2 * remInCh);
+            xb_vecN_2x32v hvecData4; IVP_LAVN_2X32_XP(hvecData4, vaData4, phvecIn4, 2 * remInCh);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vecNx16 vecCoeff1; IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, 2 * coeffPitch1 * enable2);
+            xb_vecNx16 vecCoeff2; IVP_L2UNX16_XP(vecCoeff2, pvecCoeff, 2 * coeffPitch1 * enable3);
+            xb_vecNx16 vecCoeff3; IVP_L2UNX16_XP(vecCoeff3, pvecCoeff, 2 * coeffPitch1);
+
+            IVP_MULPAN16XR16(accSum1, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData1, 0));
+            IVP_MULPAN16XR16(accSum2, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData2, 0));
+            IVP_MULPAN16XR16(accSum3, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData3, 0));
+            IVP_MULPAN16XR16(accSum4, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData4, 0));
+
+            IVP_MULPAN16XR16(accSum1, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData1, 1));
+            IVP_MULPAN16XR16(accSum2, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData2, 1));
+            IVP_MULPAN16XR16(accSum3, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData3, 1));
+            IVP_MULPAN16XR16(accSum4, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData4, 1));
+          }
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vecNx16 vecOut1, vecOut2, vecOut3, vecOut4;
+#ifdef DILATED_VQ_CONV_S16
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut1, accSum1, packShiftAccU, \
+                                             vecScaleData, outShiftU, minLim, maxLim);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut2, accSum2, packShiftAccU, \
+                                             vecScaleData, outShiftU, minLim, maxLim);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut3, accSum3, packShiftAccU, \
+                                             vecScaleData, outShiftU, minLim, maxLim);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut4, accSum4, packShiftAccU, \
+                                             vecScaleData, outShiftU, minLim, maxLim);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1, accSum1, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2, accSum2, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3, accSum3, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4, accSum4, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pvecOut = (xb_vecNx16 *) (pOut + outCh);
+        IVP_SAVNX16_XP(vecOut1, vaOutData, pvecOut, 2 * remainingOutCh);
+        IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pvecOut = (xb_vecNx16 *) (pOut + (outCh + outDataPitch1) * numX);
+        IVP_SAVNX16_XP(vecOut2, vaOutData, pvecOut, 2 * remainingOutCh * numX);
+        IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pvecOut = (xb_vecNx16 *) (pOut + (outCh + outDataPitch2) * numY);
+        IVP_SAVNX16_XP(vecOut3, vaOutData, pvecOut, 2 * remainingOutCh * numY);
+        IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pvecOut = (xb_vecNx16 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY));
+        IVP_SAVNX16_XP(vecOut4, vaOutData, pvecOut, 2 * remainingOutCh * numX * numY);
+        IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************* end of VQ MOD variants ***************************************/
+/**********************************************************************************************/
+#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/
+
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.c
new file mode 100644
index 00000000000..72b1f7f9655
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define DILATED_VQ_CONV  VQ_FALSE /* VQ_FALSE is defined by the header below; it is only expanded at the header's #if */
+
+#define INPUT_DATA_TYPE  UNSIGNED8BIT /* first expansion of the template header: U8-named kernels */
+#include "cnn_dilated_conv_MOW.h"
+#undef INPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE  SIGNED8BIT /* second expansion of the same header: S8-named kernels */
+#include "cnn_dilated_conv_MOW.h"
+#endif //if ((XCHAL_VISION_TYPE >= 6))
+
+
+
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.h
new file mode 100644
index 00000000000..9ab6fa6015a
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.h
@@ -0,0 +1,27240 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define VQ_TRUE   1
+#define VQ_FALSE  0
+
+/* Template header: it is re-included per variant, so reset the signature macros first. */
+#undef MAKE_NAME_VQ
+#undef MAKE_ARGUMENTS
+#undef MAKE_PARAMS
+
+#if DILATED_VQ_CONV == VQ_TRUE
+/* VQ build: "VQ" is spliced into the name and an outputScaleArray argument is added. */
+#define MAKE_NAME_VQ(a, b)             a ## VQ ## b
+#define MAKE_ARGUMENTS(a, b, c, d, e)  (const xai_pTile3D a, const xai_pTile4D b, const xai_pArray c, const xai_pArray outputScaleArray, xai_pTile3D d, const xai_cnn_conv_params * e)
+#define MAKE_PARAMS(a, b, c, d, e)     (a, b, c, outputScaleArray, d, e)
+
+#elif DILATED_VQ_CONV == VQ_FALSE
+/* Non-VQ build: plain name and the five-argument signature. */
+#define MAKE_NAME_VQ(a, b)             a ## b
+#define MAKE_ARGUMENTS(a, b, c, d, e)  (const xai_pTile3D a, const xai_pTile4D b, const xai_pArray c, xai_pTile3D d, const xai_cnn_conv_params * e)
+#define MAKE_PARAMS(a, b, c, d, e)     (a, b, c, d, e)
+#endif
+
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT, suffix)  name ## _ ## MORPH_FNAME_SPECIFIER_IDT ## suffix
+
+/* Reset the input-type morph macros before either branch so U8/S8 expansions work in any include order. */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_2Nx8
+#undef MORPH_OP_PRIME_2Nx8
+#undef MORPH_OP_ALIGN_LOAD_2Nx8
+#undef MORPH_OP_LOAD_2Nx8_IP
+#undef MORPH_OP_LOAD_2Nx8_VARIABLE
+#undef MORPH_OP_LOAD_2Nx8
+#undef MORPH_OP_L2_2Nx8
+#undef MORPH_OP_MULA
+#undef MORPH_OP_MUL4TA
+#undef MORPH_OP_MULQA
+#undef MORPH_OP_MULPA
+#undef MORPH_OP_GATHER
+#undef MORPH_OP_GATHER_2Nx8_LOW
+#undef MORPH_OP_GATHER_2Nx8_HIGH
+#undef MORPH_OP_DSELI
+#undef MORPH_OP_SEL
+
+#if INPUT_DATA_TYPE == UNSIGNED8BIT
+#define MAKE_NAME(name, suffix)  MAKE_NAME_IMPL(name, U8, suffix)
+#define MORPH_IDT_CHECK              XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR             uint8_t
+#define MORPH_IDT_2Nx8               xb_vec2Nx8U
+#define MORPH_OP_PRIME_2Nx8          IVP_LA2NX8U_PP
+#define MORPH_OP_ALIGN_LOAD_2Nx8     IVP_LV2NX8U_XP
+#define MORPH_OP_LOAD_2Nx8           IVP_LA2NX8U_XP
+#define MORPH_OP_L2_2Nx8             IVP_L2U2NX8U_XP
+#define MORPH_OP_LOAD_2Nx8_IP        IVP_LA2NX8U_IP
+#define MORPH_OP_LOAD_2Nx8_VARIABLE  IVP_LAV2NX8U_XP
+#define MORPH_OP_MULA                IVP_MULUSA2N8XR16
+#define MORPH_OP_MUL4TA              IVP_MULUS4TA2N8XR8
+#define MORPH_OP_MULQA               IVP_MULUSQA2N8XR8
+#define MORPH_OP_MULPA               IVP_MULUSPA2N8XR16
+#define MORPH_OP_GATHER              IVP_GATHERANX8U
+#define MORPH_OP_GATHER_2Nx8_LOW     IVP_GATHERD2NX8U_L
+#define MORPH_OP_GATHER_2Nx8_HIGH    IVP_GATHERD2NX8U_H
+#define MORPH_OP_DSELI               IVP_DSEL2NX8UI
+#define MORPH_OP_SEL                 IVP_SEL2NX8U
+
+#elif INPUT_DATA_TYPE == SIGNED8BIT
+#define MAKE_NAME(name, suffix)  MAKE_NAME_IMPL(name, S8, suffix)
+#define MORPH_IDT_CHECK              XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR             int8_t
+#define MORPH_IDT_2Nx8               xb_vec2Nx8
+#define MORPH_OP_PRIME_2Nx8          IVP_LA2NX8_PP
+#define MORPH_OP_ALIGN_LOAD_2Nx8     IVP_LV2NX8_XP
+#define MORPH_OP_LOAD_2Nx8           IVP_LA2NX8_XP
+#define MORPH_OP_L2_2Nx8             IVP_L2U2NX8_XP
+#define MORPH_OP_LOAD_2Nx8_IP        IVP_LA2NX8_IP
+#define MORPH_OP_LOAD_2Nx8_VARIABLE  IVP_LAV2NX8_XP
+#define MORPH_OP_MULA                IVP_MULA2N8XR16
+#define MORPH_OP_MUL4TA              IVP_MUL4TA2N8XR8
+#define MORPH_OP_MULQA               IVP_MULQA2N8XR8
+#define MORPH_OP_MULPA               IVP_MULPA2N8XR16
+#define MORPH_OP_GATHER              IVP_GATHERANX8S
+#define MORPH_OP_GATHER_2Nx8_LOW     IVP_GATHERD2NX8_L
+#define MORPH_OP_GATHER_2Nx8_HIGH    IVP_GATHERD2NX8_H
+#define MORPH_OP_DSELI               IVP_DSEL2NX8I
+#define MORPH_OP_SEL                 IVP_SEL2NX8
+#endif
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_1x1j1d1I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 1x1 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 1x1 3D MOW_WHD dilated convolution function */
+/*               and 1x1 3D VQ MOW_WHD dilated convolution function for U8    */
+/*               bit and S8 bit input data with input stride equal to 1       */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 1x1xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+/*********************************************************************************
+   convolved3D_S_1x1j1d1_S8S8IX_MOW_WHD_NOEDGE
+   convolved3D_S_1x1j1d1_U8S8IX_MOW_WHD_NOEDGE
+   convolvedVQ3D_S_1x1j1d1_S8S8IX_MOW_WHD_NOEDGE
+   convolvedVQ3D_S_1x1j1d1_U8S8IX_MOW_WHD_NOEDGE
+ * MOW no edge variant                                                            *
+ * If DataPitch1 = width for input and output tile                              *
+ **********************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_1x1j1d1), S8IX_MOW_WHD_NOEDGE) \
+  MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW          = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH          = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh       = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t inDataPitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const int32_t coeffPitch3   = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  uint8_t enableReLu          = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH;
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  int32_t inCh, outCh, xy;
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+  MORPH_IDT_2Nx8 *restrict pdvecIn3;
+  MORPH_IDT_2Nx8 *restrict pdvecIn4;
+  xb_vec2Nx8 * restrict pdvecOut;
+  xb_vec2Nx8 * restrict pdvecCoeff1;
+  xb_vec2Nx8 * restrict pdvecCoeff2;
+  xb_vec2Nx8 * restrict pdvecCoeff3;
+  xb_vec2Nx8 * restrict pdvecCoeff4;
+
+  /* There are no edges input and output width. Output width and
+   * height loops are combined. Input data is loaded continuously
+   * from the input WH plane and output is stored continuously in
+   * output WH plane.
+   * The overall design approach is split into 2 sections, one
+   * with aligned input data and the other with unaligned input data.
+   */
+  if (XAI_TILE3D_IS_ALIGNED_2NX8(inTile))
+  {
+    for (xy = 0; xy < outW * outH; xy += vectorizationWidth) /* Loop across Output width */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[xy * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[xy];
+
+      /* initialize coeff and Bias data pointer */
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* Load the bias values corresponding to four output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* Coefficient and input pointers */
+        int8_t *pCoeff = &pCoeffData[outCh * coeffPitch3];
+        pdvecCoeff1 = (xb_vec2Nx8 *) pCoeff;
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh);
+        pdvecIn1    = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2    = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2);
+        pdvecIn3    = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+        pdvecIn4    = (MORPH_IDT_2Nx8 *) (pInput + 3 * inDataPitch2);
+
+        /* Priming Loads for Coefficients */
+        valign vaCoeff1 = IVP_LA2NX8_PP(pdvecCoeff1);
+        valign vaCoeff2 = IVP_LA2NX8_PP(pdvecCoeff2);
+        valign vaCoeff3 = IVP_LA2NX8_PP(pdvecCoeff3);
+        valign vaCoeff4 = IVP_LA2NX8_PP(pdvecCoeff4);
+
+        for (inCh = 0; inCh < numInCh - 3; inCh += 4)  /* Loop across input depth */
+        {
+          /* input vectors are read from 4 input depths at a time
+           * Scalar 32 bit coeff are extracted from the coeff vectors */
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+
+          /* Read vector input data from 1st depth */
+          MORPH_OP_ALIGN_LOAD_2Nx8(dvecData1, pdvecIn1, 4 * inDataPitch2);
+
+          /* Read vector input data from 2nd depth */
+          MORPH_OP_ALIGN_LOAD_2Nx8(dvecData2, pdvecIn2, 4 * inDataPitch2);
+
+          /* Read vector input data from 3rd depth */
+          MORPH_OP_ALIGN_LOAD_2Nx8(dvecData3, pdvecIn3, 4 * inDataPitch2);
+
+          /* Read vector input data from 4th depth */
+          MORPH_OP_ALIGN_LOAD_2Nx8(dvecData4, pdvecIn4, 4 * inDataPitch2);
+
+          xb_vec2Nx8 dvecCoeff1, dvecCoeff2, dvecCoeff3, dvecCoeff4;
+          IVP_LAV2NX8_XP(dvecCoeff1, vaCoeff1, pdvecCoeff1, 4);
+          IVP_LAV2NX8_XP(dvecCoeff2, vaCoeff2, pdvecCoeff2, 4);
+          IVP_LAV2NX8_XP(dvecCoeff3, vaCoeff3, pdvecCoeff3, 4);
+          IVP_LAV2NX8_XP(dvecCoeff4, vaCoeff4, pdvecCoeff4, 4);
+
+          int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                            (IVP_MOVNX16_FROM2NX8(dvecCoeff1)), 0);
+          int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                            (IVP_MOVNX16_FROM2NX8(dvecCoeff2)), 0);
+          int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                            (IVP_MOVNX16_FROM2NX8(dvecCoeff3)), 0);
+          int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                            (IVP_MOVNX16_FROM2NX8(dvecCoeff4)), 0);
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4);
+        } /* end of for (inCh = 0; inCh < numInCh - 3; inCh += 4)*/
+
+        /* Corner case handling if number of inCh is not a multiple of 4 */
+        if (inCh < numInCh)
+        {
+          int32_t remInCh = numInCh - inCh;
+
+          /* input vectors are read from 4 input depths at a time
+           * Scalar 32 bit coeff are extracted from the coeff vectors */
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+
+          /* Read vector input data from 1st depth */
+          MORPH_OP_ALIGN_LOAD_2Nx8(dvecData1, pdvecIn1, inDataPitch2 * XT_SALT(1, remInCh));
+
+          /* Read vector input data from 2nd depth */
+          MORPH_OP_ALIGN_LOAD_2Nx8(dvecData2, pdvecIn1, inDataPitch2 * XT_SALT(2, remInCh));
+
+          /* Read vector input data from 3rd depth */
+          MORPH_OP_ALIGN_LOAD_2Nx8(dvecData3, pdvecIn1, 0);
+
+          xb_vec2Nx8 dvecCoeff1, dvecCoeff2, dvecCoeff3, dvecCoeff4;
+          IVP_LAV2NX8_XP(dvecCoeff1, vaCoeff1, pdvecCoeff1, remInCh);
+          IVP_LAV2NX8_XP(dvecCoeff2, vaCoeff2, pdvecCoeff2, remInCh);
+          IVP_LAV2NX8_XP(dvecCoeff3, vaCoeff3, pdvecCoeff3, remInCh);
+          IVP_LAV2NX8_XP(dvecCoeff4, vaCoeff4, pdvecCoeff4, remInCh);
+
+          int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                            (IVP_MOVNX16_FROM2NX8(dvecCoeff1)), 0);
+          int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                            (IVP_MOVNX16_FROM2NX8(dvecCoeff2)), 0);
+          int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                            (IVP_MOVNX16_FROM2NX8(dvecCoeff3)), 0);
+          int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                            (IVP_MOVNX16_FROM2NX8(dvecCoeff4)), 0);
+
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4);
+        } /* end of corner case handling*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable store count */
+        int32_t varLen = outW * outH - xy;
+
+        /* store output to 1st output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData; vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* store output to 2nd output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* store output to 3rd output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* store output to 4th output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }   /* end of for (xy = 0; xy < outW*outH; xy += vectorizationWidth)*/
+  }
+  else
+  {
+#ifdef __XCC__
+    XT_MEMW();   /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+    /* generate the sequence 0,1,2,3,0 + coeffPitch3, 1 + coeffPitch3,
+     * 2 + coeffPitch3, 3 + coeffPitch3, 0 + 2 * coeffPitch3, 1 + 2 * coeffPitch3,
+     * 2 + 2 * coeffPitch3, 3 + 2 * coeffPitch3, .....
+     *
+     * e.g., if coeffPitch3 is 32:
+     * 0,1,2,3,32,33,34,35,64,65,66,67,96,97,98,99,..
+     *
+     * This sequence is used to gather coeff from 4 diff output channels, 4 each from
+     * every channel corresponding to 4 i/p channels, as innermost loop(inCh) is unrolled by
+     * 4 to make use of quad multiplier.
+     */
+    xb_vecNx16U vecIdx1 = IVP_SEQNX16();
+    vecIdx1 = IVP_PACKVRNRNX48(IVP_MULNX16(vecIdx1, coeffPitch3), 0);
+    vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 1), vecIdx1, IVP_SELI_16B_INTERLEAVE_1_LO);
+    vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 2), vecIdx1, IVP_SELI_32B_INTERLEAVE_1_LO);
+    xb_gsr gs0;
+
+    for (xy = 0; xy < outW * outH; xy += vectorizationWidth) /* Loop across Output width * Outputheight */
+    {
+      xb_vecNx16U vecIdx2;
+      /* variable store count */
+      int32_t varLen = outW * outH - xy;
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[xy * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[xy];
+
+      /* initialize  Bias data pointer */
+      int32_t *pBias = &pBiasData[0];
+      int8_t *pCoeff = &pCoeffData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* load and replicate bias data for each output channel */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* boolean mask to gather coeffs, if all the four o/p channels
+         * are present 16 coeff are loaded.
+         */
+        vboolN mask = IVP_LTNX16(IVP_SEQNX16(), (xb_vecNx16) XT_MIN(((numOutCh - outCh) * 4), 16));
+        /* Assign valid address for predicated false lines */
+        vecIdx2  = IVP_MOVNX16UT(vecIdx1, 0, mask);
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+
+        for (inCh = 0; inCh < numInCh - 3; inCh += 4)        /* Loop across input depth */
+        {
+          /* input vectors are read from 4 input depths at a time
+           * Scalar 32 bit coeff are extracted from the coeff vectors */
+
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+
+          /* Read vector input data from 1st depth */
+          valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2);
+
+          /* Read vector input data from 2nd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2);
+
+          /* Read vector input data from 3rd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2);
+
+          /* Read vector input data from 4th depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2);
+
+          /* gather the coeffs */
+          gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2);
+          xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0);
+
+          /* extract scalar coeff from coeff vectors */
+          int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */
+          int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */
+          int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */
+          int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4);
+        } /* end of for (inCh = 0; inCh < numInCh; inCh += 4)*/
+        if (inCh < numInCh)
+        {
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (((int8_t *) pdvecIn1) + 2 * inDataPitch2 * XT_SALT(inCh, numInCh - 2));
+          /* Read vector input data from 1st depth */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1));
+
+          /* Read vector input data from 2nd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LAV2NX8_XP(dvecData2, vaInData, pdvecIn1, varLen * XT_SALT(inCh, numInCh - 1));
+
+          /* Read vector input data from 3rd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          IVP_LAV2NX8_XP(dvecData3, vaInData, pdvecIn2, varLen * XT_SALT(inCh, numInCh - 2));
+
+          /* Boolean mask for gather to handle cases where inCh<4 */
+          vboolN mask1 = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), (numInCh - inCh));
+          /* Assign valid address for predicated false lines */
+          vecIdx2 = IVP_MOVNX16UT(vecIdx2, 0, mask1);
+          /* Gather coeffs */
+          gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2);
+          xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0);
+
+          /* extract scalar coeff from coeff vectors */
+          int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */
+          int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */
+          int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */
+          int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */
+
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4);
+        } /* end of if (inCh < numInCh)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* store output to 1st output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData; vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* store output to 2nd output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* store output to 3rd output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* store output to 4th output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }   /* end of for (xy = 0; xy < outW*outH; xy += vectorizationWidth)*/
+  }
+}
+
+/******************************************************************************************
+* MOW fold 16 Stride 1 variant                                                            *
+* If inDataPitch1 is less than or equal to                                                *
+* 16 this function is called.                                                             *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_1x1j1d1), S8IX_MOW_WHD_FOLD16) \
+  MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW          = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH          = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh       = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t inDataPitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const int32_t coeffPitch3   = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  uint8_t enableReLu          = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  int32_t inCh, outCh, y;
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+
+  xb_vec2Nx8 * restrict pdvecOut;
+
+  xb_vecN_2x32v * restrict phvecCoeff1;
+
+
+  /* there are 2 implementations, one for
+   * input channels less than or equal to 64, and other for input channels
+   * greater than 64.
+   * Adding one more loop to support more than 64 input channels causes
+   * significant overhead and degrades the performance.
+   */
+
+  if ((numInCh <= 2 * XCHAL_IVPN_SIMD_WIDTH))
+  {
+    for (y = 0; y < outH; y += 4) /* Loop across Output height */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* Load the bias values corresponding to four output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+
+        /* variables for coeff loads */
+        xb_vecN_2x32v hvecCoeffData1, hvecCoeffData2, hvecCoeffData3, hvecCoeffData4;
+
+        /* read coeff vectors , for 4 consecutive output depths */
+        /* coeff vector for 1st output channel */
+        phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+        valign vaCoeffData; vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1);
+        IVP_LAVN_2X32_XP(hvecCoeffData1, vaCoeffData, phvecCoeff1, coeffPitch3);
+
+        /* coeff vector for 2nd output channel */
+        vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1);
+        IVP_LAVN_2X32_XP(hvecCoeffData2, vaCoeffData, phvecCoeff1, coeffPitch3 * enable2ndCh);
+
+        /* coeff vector for 3rd output channel */
+        vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1);
+        IVP_LAVN_2X32_XP(hvecCoeffData3, vaCoeffData, phvecCoeff1, coeffPitch3 * enable3rdCh);
+
+        /* coeff vector for 4th output channel */
+        vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1);
+        IVP_LAVN_2X32_XP(hvecCoeffData4, vaCoeffData, phvecCoeff1, coeffPitch3 * enable4thCh);
+
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+
+        for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */
+        {
+          /* input vectors are read from 4 input depths at a time
+           * Scalar 32 bit coeff are extracted from the coeff vectors */
+
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+
+          /* Read vector input data from 1st depth */
+          valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2);
+
+          /* Read vector input data from 2nd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2);
+
+          /* Read vector input data from 3rd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2);
+
+          /* Read vector input data from 4th depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2);
+
+          /* extract scalar coeff from coeff vectors */
+          int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */
+          int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */
+          int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */
+          int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4);
+        } /* end of for (inCh = 0; inCh < numInCh; inCh += 4)*/
+          /* Corner case handling if number of inCh is not a multiple of 4 */
+        if (inCh < numInCh)
+        {
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+
+          /* Read vector input data from 1st depth */
+          valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1));
+
+          /* Read vector input data from 2nd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 2));
+
+          /* Read vector input data from 3rd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn1, inDataPitch2);
+
+
+          /* extract scalar coeff from coeff vectors */
+          int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */
+          int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */
+          int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */
+          int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */
+
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4);
+        }
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* In order to handle output heights that are not a multiple of 4 */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+        int32_t enable3rdRow = XT_SALT(y, outH - 2);
+        int32_t enable4thRow = XT_SALT(y, outH - 3);
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                       vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first output depth, 3rd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * enable3rdRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                       vaOutData, pdvecOut, bytesPerPixel * enable3rdRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first output depth, 4th row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch1 * enable4thRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), \
+                       vaOutData, pdvecOut, bytesPerPixel * enable4thRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                       enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, 3rd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              2 * outDataPitch1 * enable3rdRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                       enable3rdRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, 4th row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              3 * outDataPitch1 * enable4thRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                       enable4thRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 3rd output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 3rd output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \
+                       enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 3rd output depth, 3rd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              2 * outDataPitch1 * enable3rdRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \
+                       enable3rdRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 3rd output depth, 4th row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              3 * outDataPitch1 * enable4thRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \
+                       enable4thRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 4th output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable4thCh * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 4th output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \
+                       enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 4th output depth, 3rd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              2 * outDataPitch1 * enable3rdRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \
+                       enable3rdRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 4th output depth, 4th row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              3 * outDataPitch1 * enable4thRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \
+                       enable4thRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }   /* end of for (y = 0; y < outH; y++)*/
+  }     /* end of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/
+  else
+  {
+#ifdef __XCC__
+    XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+    /* generate the sequence 0,1,2,3,0 + coeffPitch3, 1 + coeffPitch3,
+     * 2 + coeffPitch3, 3 + coeffPitch3, 0 + 2 * coeffPitch3, 1 + 2 * coeffPitch3,
+     * 2 + 2 * coeffPitch3, 3 + 2 * coeffPitch3, .....
+     *
+     * for e.g, if coeffPitch3 is 32:
+     * 0,1,2,3,32,33,34,35,64,65,66,67,96,97,98,99,..
+     *
+     * This sequence is used to gather coeff from 4 diff output channels, 4 each from
+     * every channel corresponding to 4 i/p channels, as innermost loop(inCh) is unrolled by
+     * 4 to make use of quad multiplier.
+     */
+    xb_vecNx16U vecIdx1 = IVP_SEQNX16();
+    vecIdx1 = IVP_PACKVRNRNX48(IVP_MULNX16(vecIdx1, coeffPitch3), 0);
+    vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 1), vecIdx1, IVP_SELI_16B_INTERLEAVE_1_LO);
+    vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 2), vecIdx1, IVP_SELI_32B_INTERLEAVE_1_LO);
+    xb_gsr gs0;
+
+    xb_vecNx16U vecIdx2;
+    for (y = 0; y < outH; y += 4)        /* Loop across Output height */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      /* initialize  Bias data pointer */
+
+      int32_t *pBias = &pBiasData[0];
+      int8_t *pCoeff = &pCoeffData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4)  /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* Load the bias values corresponding to four output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* boolean mask to gather coeffs, if all the four o/p channels
+         * are present 16 coeff are loaded.
+         */
+        vboolN mask = IVP_LTNX16(IVP_SEQNX16(), (xb_vecNx16) XT_MIN(((numOutCh - outCh) * 4), 16));
+        /* Assign valid address for predicated false lines */
+        vecIdx2  = IVP_MOVNX16UT(vecIdx1, 0, mask);
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+
+        for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */
+        {
+          /* input vectors are read from 4 input depths at a time
+           * Scalar 32 bit coeff are extracted from the coeff vectors */
+
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+
+          /* Read vector input data from 1st depth */
+          valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2);
+
+          /* Read vector input data from 2nd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2);
+
+          /* Read vector input data from 3rd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2);
+
+          /* Read vector input data from 4th depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2);
+
+          /* gather the coeffs */
+          gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2);
+          xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0);
+
+          /* extract scalar coeff from coeff vectors */
+          int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */
+          int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */
+          int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */
+          int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4);
+        }       /* end of for (inCh = 0; inCh < numInCh; inCh += 4)*/
+        if (inCh < numInCh)
+        {
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (((int8_t *) pdvecIn1) + 2 * inDataPitch2 * XT_SALT(inCh, numInCh - 2));
+          /* Read vector input data from 1st depth */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1));
+
+          /* Read vector input data from 2nd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LAV2NX8_XP(dvecData2, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * XT_SALT(inCh, numInCh - 1));
+
+          /* Read vector input data from 3rd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          IVP_LAV2NX8_XP(dvecData3, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * XT_SALT(inCh, numInCh - 2));
+
+          /* Boolean mask for gather to handle cases where inCh<4 */
+          vboolN mask1 = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), (numInCh - inCh));
+          /* Assign valid address for predicated false lines */
+          vecIdx2 = IVP_MOVNX16UT(vecIdx2, 0, mask1);
+          /* Gather coeffs */
+          gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2);
+          xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0);
+
+          /* extract scalar coeff from coeff vectors */
+          int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */
+          int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */
+          int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */
+          int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */
+
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4);
+        } /* end of if (inCh < numInCh)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* In order to handle odd rows*/
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+        int32_t enable3rdRow = XT_SALT(y, outH - 2);
+        int32_t enable4thRow = XT_SALT(y, outH - 3);
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                       vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first output depth, 3rd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * enable3rdRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                       vaOutData, pdvecOut, bytesPerPixel * enable3rdRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first output depth, 4th row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch1 * enable4thRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), \
+                       vaOutData, pdvecOut, bytesPerPixel * enable4thRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                       enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, 3rd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              2 * outDataPitch1 * enable3rdRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                       enable3rdRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, 4th row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              3 * outDataPitch1 * enable4thRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                       enable4thRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 3rd output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 3rd output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \
+                       enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 3rd output depth, 3rd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              2 * outDataPitch1 * enable3rdRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \
+                       enable3rdRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 3rd output depth, 4th row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              3 * outDataPitch1 * enable4thRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \
+                       enable4thRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 4th output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable4thCh * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 4th output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \
+                       enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 4th output depth, 3rd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              2 * outDataPitch1 * enable3rdRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \
+                       enable3rdRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 4th output depth, 4th row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              3 * outDataPitch1 * enable4thRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \
+                       enable4thRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }   /* end of for (y = 0; y < outH; y++)*/
+  }     /* end of else part of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/
+}
+
+
+/******************************************************************************************
+* MOW fold 32 Stride 1 variant                                                            *
+* If inDataPitch1 is less than or equal to                                                *
+* 16 this function is called.                                                             *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_1x1j1d1), S8IX_MOW_WHD_FOLD32) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW          = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH          = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh       = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t inDataPitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const int32_t coeffPitch3   = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  uint8_t enableReLu          = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  int32_t inCh, outCh, y;
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+
+  xb_vec2Nx8 * restrict pdvecOut;
+
+  xb_vecN_2x32v * restrict phvecCoeff1;
+
+
+  /* there are 2 implementations, one for
+   * input channels less than or equal to 64, and other for input channels
+   * greater than 64.
+   * Adding one more loop to support more than 64 input channels is causing
+   * significant overhead and degrades the performance.
+   */
+
+  if ((numInCh <= 2 * XCHAL_IVPN_SIMD_WIDTH))
+  {
+    for (y = 0; y < outH; y += 2) /* Loop across Output height */
+    {
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* Load the bias values corresponding to four output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+
+        /* variables for coeff loads */
+        xb_vecN_2x32v hvecCoeffData1, hvecCoeffData2, hvecCoeffData3, hvecCoeffData4;
+
+        /* read coeff vectors , for 4 consecutive output depths */
+        /* coeff vector for 1st output channel */
+        phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+        valign vaCoeffData; vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1);
+        IVP_LAVN_2X32_XP(hvecCoeffData1, vaCoeffData, phvecCoeff1, coeffPitch3);
+
+        /* coeff vector for 2nd output channel */
+        vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1);
+        IVP_LAVN_2X32_XP(hvecCoeffData2, vaCoeffData, phvecCoeff1, coeffPitch3 * enable2ndCh);
+
+        /* coeff vector for 3rd output channel */
+        vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1);
+        IVP_LAVN_2X32_XP(hvecCoeffData3, vaCoeffData, phvecCoeff1, coeffPitch3 * enable3rdCh);
+
+        /* coeff vector for 4th output channel */
+        vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1);
+        IVP_LAVN_2X32_XP(hvecCoeffData4, vaCoeffData, phvecCoeff1, coeffPitch3 * enable4thCh);
+
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+
+        for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */
+        {
+          /* input vectors are read from 4 input depths at a time
+           * Scalar 32 bit coeff are extracted from the coeff vectors */
+
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+
+          /* Read vector input data from 1st depth */
+          valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2);
+
+          /* Read vector input data from 2nd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2);
+
+          /* Read vector input data from 3rd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2);
+
+          /* Read vector input data from 4th depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2);
+
+          /* extract scalar coeff from coeff vectors */
+          int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */
+          int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */
+          int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */
+          int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4);
+        } /* end of for (inCh = 0; inCh < numInCh; inCh += 4)*/
+          /* Corner case handling if number of inCh is not a multiple of 4 */
+        if (inCh < numInCh)
+        {
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+
+          /* Read vector input data from 1st depth */
+          valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1));
+
+          /* Read vector input data from 2nd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 2));
+
+          /* Read vector input data from 3rd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn1, 0);
+
+
+          /* extract scalar coeff from coeff vectors */
+          int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */
+          int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */
+          int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */
+          int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */
+
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4);
+        }
+
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                       vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                       enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 3rd output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 3rd output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \
+                       enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 4th output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable4thCh * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 4th output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \
+                       enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }   /* end of for (y = 0; y < outH; y++)*/
+  }     /* end of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/
+  else
+  {
+#ifdef __XCC__
+    XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+    /* generate the sequence 0,1,2,3,0 + coeffPitch3, 1 + coeffPitch3,
+     * 2 + coeffPitch3, 3 + coeffPitch3, 0 + 2 * coeffPitch3, 1 + 2 * coeffPitch3,
+     * 2 + 2 * coeffPitch3, 3 + 2 * coeffPitch3, .....
+     *
+     * for e.g, if coeffPitch3 is 32:
+     * 0,1,2,3,32,33,34,35,64,65,66,67,96,97,98,99,..
+     *
+     * This sequence is used to gather coeff from 4 diff output channels, 4 each from
+     * every channel corresponding to 4 i/p channels, as innermost loop(inCh) is unrolled by
+     * 4 to make use of the quad multiplier.
+     */
+    xb_vecNx16U vecIdx1 = IVP_SEQNX16();
+    vecIdx1 = IVP_PACKVRNRNX48(IVP_MULNX16(vecIdx1, coeffPitch3), 0);
+    vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 1), vecIdx1, IVP_SELI_16B_INTERLEAVE_1_LO);
+    vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 2), vecIdx1, IVP_SELI_32B_INTERLEAVE_1_LO);
+    xb_gsr gs0;
+
+    for (y = 0; y < outH; y += 2)        /* Loop across Output height */
+    {
+      xb_vecNx16U vecIdx2;
+      /* In order to handle odd rows*/
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      /* initialize  Bias data pointer */
+
+      int32_t *pBias = &pBiasData[0];
+      int8_t *pCoeff = &pCoeffData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4)  /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* load and replicate bias data for each output channel */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* boolean mask to gather coeffs, if all the four o/p channels
+         * are present 16 coeff are loaded.
+         */
+        vboolN mask = IVP_LTNX16(IVP_SEQNX16(), (xb_vecNx16) XT_MIN(((numOutCh - outCh) * 4), 16));
+        /* Assign valid address for predicated false lines */
+        vecIdx2 = IVP_MOVNX16UT(vecIdx1, 0, mask);
+
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+
+#ifdef IS_VISION_130
+        for (inCh = 0; inCh < numInCh - 3; inCh += 4)        /* Loop across input depth */
+        {
+          /* input vectors are read from 4 input depths at a time
+           * Scalar 32 bit coeff are extracted from the coeff vectors */
+
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+
+          /* Read vector input data from 1st depth */
+          MORPH_OP_L2_2Nx8(dvecData1, pdvecIn1, inDataPitch2);
+
+          /* Read vector input data from 2nd depth */
+          MORPH_OP_L2_2Nx8(dvecData2, pdvecIn1, 3 * inDataPitch2);
+
+          /* Read vector input data from 3rd depth */
+          MORPH_OP_L2_2Nx8(dvecData3, pdvecIn2, inDataPitch2);
+
+          /* Read vector input data from 4th depth */
+          MORPH_OP_L2_2Nx8(dvecData4, pdvecIn2, 3 * inDataPitch2);
+
+          /* gather the coeffs */
+          gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2);
+          xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0);
+
+          /* extract scalar coeff from coeff vectors */
+          int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */
+          int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */
+          int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */
+          int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4);
+        }       /* end of for (inCh = 0; inCh < numInCh; inCh += 4)*/
+        if (inCh < numInCh)
+        {
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (((int8_t *) pdvecIn1) + 2 * inDataPitch2 * XT_SALT(inCh, numInCh - 2));
+          /* Read vector input data from 1st depth */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1));
+
+          /* Read vector input data from 2nd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LAV2NX8_XP(dvecData2, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * XT_SALT(inCh, numInCh - 1));
+
+          /* Read vector input data from 3rd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          IVP_LAV2NX8_XP(dvecData3, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * XT_SALT(inCh, numInCh - 2));
+
+          /* Boolean mask for gather to handle cases where inCh<4 */
+          vboolN mask1 = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), (numInCh - inCh));
+          /* Assign valid address for predicated false lines */
+          vecIdx2 = IVP_MOVNX16UT(vecIdx2, 0, mask1);
+          /* Gather coeffs */
+          gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2);
+          xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0);
+
+          /* extract scalar coeff from coeff vectors */
+          int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */
+          int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */
+          int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */
+          int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */
+
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4);
+        } /* end of if (inCh < numInCh)*/
+#else
+        for (inCh = 0; inCh < numInCh - 3; inCh += 4)        /* Loop across input depth */
+        {
+          /* input vectors are read from 4 input depths at a time
+           * Scalar 32 bit coeff are extracted from the coeff vectors */
+
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+
+          /* Read vector input data from 1st depth */
+          valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2);
+
+          /* Read vector input data from 2nd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2);
+
+          /* Read vector input data from 3rd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2);
+
+          /* Read vector input data from 4th depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2);
+
+          /* gather the coeffs */
+          gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2);
+          xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0);
+
+          /* extract scalar coeff from coeff vectors */
+          int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */
+          int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */
+          int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */
+          int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4);
+        }       /* end of for (inCh = 0; inCh < numInCh; inCh += 4)*/
+        if (inCh < numInCh)
+        {
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (((int8_t *) pdvecIn1) + 2 * inDataPitch2 * XT_SALT(inCh, numInCh - 2));
+          /* Read vector input data from 1st depth */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1));
+
+          /* Read vector input data from 2nd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LAV2NX8_XP(dvecData2, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * XT_SALT(inCh, numInCh - 1));
+
+          /* Read vector input data from 3rd depth */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          IVP_LAV2NX8_XP(dvecData3, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * XT_SALT(inCh, numInCh - 2));
+
+          /* Boolean mask for gather to handle cases where inCh<4 */
+          vboolN mask1 = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), (numInCh - inCh));
+          /* Assign valid address for predicated false lines */
+          vecIdx2 = IVP_MOVNX16UT(vecIdx2, 0, mask1);
+          /* Gather coeffs */
+          gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2);
+          xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0);
+
+          /* extract scalar coeff from coeff vectors */
+          int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */
+          int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */
+          int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */
+          int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                            IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */
+
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1);
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2);
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3);
+          MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4);
+        } /* end of if (inCh < numInCh)*/
+#endif
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                       vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                       enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 3rd output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 3rd output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \
+                       enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 4th output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable4thCh * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 4th output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                       vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \
+                       enable2ndRow * outW);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }   /* end of for (y = 0; y < outH; y++)*/
+  }     /* end of else part of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/
+}
+
+/****************** xaiConvolvedVQ3D_S_1x1j1d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_1x1j1d1_U8S8IX_MOW_WHD ******************/
+/****************** xaiConvolved3D_S_1x1j1d1_S8S8IX_MOW_WHD *****************/
+/****************** xaiConvolved3D_S_1x1j1d1_U8S8IX_MOW_WHD *****************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_1x1j1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 1);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW          = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH          = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh       = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t inDataPitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const int32_t coeffPitch3   = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+  /* if pitch = width in input and output tile call the no edge variant*/
+  int32_t enableFlatten = ((inDataPitch1 == XAI_TILE3D_GET_DIM1(inTile)) && \
+                           (outDataPitch1 == outW) && (inDataPitch1 == outDataPitch1));
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    if ((XAI_TILE3D_IS_ALIGNED_2NX8(inTile) == 0) && (enableFlatten || (numInCh > 2 * XCHAL_IVPN_SIMD_WIDTH)))
+    {
+      XAI_CHECK_TILE4D_FITS_IN_SINGLE_DRAM(coeffTile);
+      if (numOutCh > 1)
+      {
+        /* Max value of Gather Offset is (min(numOutCh-1,3)*coeffPitch3 + min(numInCh-1, 3)) */
+        XAI_CHECK_ERROR(coeffPitch3 < ((USHRT_MAX - XT_MIN(numInCh - 1, 3)) / XT_MIN(numOutCh - 1, 3)),                              \
+                        XAI_ERR_BADARG, "\ndim3Pitch value of coeffTile = %d, should be less than Gather Offset(16-bit limit) - %d", \
+                        coeffPitch3, ((USHRT_MAX - XT_MIN(numInCh - 1, 3)) / XT_MIN(numOutCh - 1, 3)));
+      }
+    }
+  }
+  /* if pitch = width in input and output tile call the no edge variant*/
+  if (enableFlatten)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_1x1j1d1), S8IX_MOW_WHD_NOEDGE) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+  /* check inDataPitch1, if it is less than or equal to 16,
+   * call FOLD16 variant and if it's greater than
+   * 16 but less than or equal to 32 call FOLD32 variant otherwise continue
+   */
+  if (inDataPitch1 <= 16)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_1x1j1d1), S8IX_MOW_WHD_FOLD16) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  if (inDataPitch1 <= 32)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_1x1j1d1), S8IX_MOW_WHD_FOLD32) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  uint8_t enableReLu          = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH;
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  int32_t inCh, outCh, x, y;
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+  MORPH_IDT_2Nx8 *restrict pdvecIn3;
+  MORPH_IDT_2Nx8 *restrict pdvecIn4;
+  xb_vec2Nx8 * restrict pdvecOut;
+  xb_vec2Nx8 * restrict pdvecCoeff1;
+  xb_vec2Nx8 * restrict pdvecCoeff2;
+  xb_vec2Nx8 * restrict pdvecCoeff3;
+  xb_vec2Nx8 * restrict pdvecCoeff4;
+  xb_vecN_2x32v * restrict phvecCoeff1;
+
+  /* The overall design approach is split into 2 sections, one
+   * with aligned input data and the other with unaligned input data.
+   * The implementation with aligned input data gives the best performance */
+
+  /* In the unaligned input data case, there are 2 implementations, one for
+   * input channels less than or equal to 64, and other for input channels
+   * greater than 64.
+   * Adding one more loop to support more than 64 input channels is causing
+   * significant overhead and degrades the performance.
+   */
+
+  if (XAI_TILE3D_IS_ALIGNED_2NX8(inTile))
+  {
+    for (x = 0; x < outW; x += vectorizationWidth)  /* Loop across Output width */
+    {
+      for (y = 0; y < outH; y++)  /* Loop across Output height */
+      {
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+        /* initialize Bias data pointer (coeff pointer is set per output-channel group below) */
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */
+        {
+          /* In order to handle odd depths*/
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+          int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+          int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+          /* Load the bias values corresponding to four output channels */
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+          xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+          xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+          xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+          dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+          dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+          dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+          IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+          dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+          IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+
+          /* Coefficient and input pointers */
+          int8_t *pCoeff = &pCoeffData[outCh * coeffPitch3];
+          pdvecCoeff1 = (xb_vec2Nx8 *) pCoeff;
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+          pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh);
+          pdvecIn1    = (MORPH_IDT_2Nx8 *) (pInput);
+          pdvecIn2    = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2);
+          pdvecIn3    = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+          pdvecIn4    = (MORPH_IDT_2Nx8 *) (pInput + 3 * inDataPitch2);
+
+          /* Priming Loads for Coefficients */
+          valign vaCoeff1 = IVP_LA2NX8_PP(pdvecCoeff1);
+          valign vaCoeff2 = IVP_LA2NX8_PP(pdvecCoeff2);
+          valign vaCoeff3 = IVP_LA2NX8_PP(pdvecCoeff3);
+          valign vaCoeff4 = IVP_LA2NX8_PP(pdvecCoeff4);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4)  /* Loop across input depth */
+          {
+            /* input vectors are read from 4 input depths at a time
+             * Scalar 32 bit coeff are extracted from the coeff vectors */
+            MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+
+            /* Read vector input data from 1st depth */
+            MORPH_OP_ALIGN_LOAD_2Nx8(dvecData1, pdvecIn1, 4 * inDataPitch2);
+
+            /* Read vector input data from 2nd depth */
+            MORPH_OP_ALIGN_LOAD_2Nx8(dvecData2, pdvecIn2, 4 * inDataPitch2);
+
+            /* Read vector input data from 3rd depth */
+            MORPH_OP_ALIGN_LOAD_2Nx8(dvecData3, pdvecIn3, 4 * inDataPitch2);
+
+            /* Read vector input data from 4th depth */
+            MORPH_OP_ALIGN_LOAD_2Nx8(dvecData4, pdvecIn4, 4 * inDataPitch2);
+
+            xb_vec2Nx8 dvecCoeff1, dvecCoeff2, dvecCoeff3, dvecCoeff4;
+            IVP_LAV2NX8_XP(dvecCoeff1, vaCoeff1, pdvecCoeff1, 4);
+            IVP_LAV2NX8_XP(dvecCoeff2, vaCoeff2, pdvecCoeff2, 4);
+            IVP_LAV2NX8_XP(dvecCoeff3, vaCoeff3, pdvecCoeff3, 4);
+            IVP_LAV2NX8_XP(dvecCoeff4, vaCoeff4, pdvecCoeff4, 4);
+
+            int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                              (IVP_MOVNX16_FROM2NX8(dvecCoeff1)), 0);
+            int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                              (IVP_MOVNX16_FROM2NX8(dvecCoeff2)), 0);
+            int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                              (IVP_MOVNX16_FROM2NX8(dvecCoeff3)), 0);
+            int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                              (IVP_MOVNX16_FROM2NX8(dvecCoeff4)), 0);
+
+            MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1);
+            MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2);
+            MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3);
+            MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4);
+          } /* end of for (inCh = 0; inCh < numInCh - 3; inCh += 4)*/
+
+          /* Corner case handling if number of inCh is not a multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+
+            /* input vectors are read from the remaining (up to 3) input depths;
+             * Scalar 32 bit coeff are extracted from the coeff vectors */
+            MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+
+            /* Read vector input data from 1st depth */
+            MORPH_OP_ALIGN_LOAD_2Nx8(dvecData1, pdvecIn1, inDataPitch2 * XT_SALT(1, remInCh));
+
+            /* Read vector input data from 2nd depth */
+            MORPH_OP_ALIGN_LOAD_2Nx8(dvecData2, pdvecIn1, inDataPitch2 * XT_SALT(2, remInCh));
+
+            /* Read vector input data from 3rd depth */
+            MORPH_OP_ALIGN_LOAD_2Nx8(dvecData3, pdvecIn1, 0);
+
+            xb_vec2Nx8 dvecCoeff1, dvecCoeff2, dvecCoeff3, dvecCoeff4;
+            IVP_LAV2NX8_XP(dvecCoeff1, vaCoeff1, pdvecCoeff1, remInCh);
+            IVP_LAV2NX8_XP(dvecCoeff2, vaCoeff2, pdvecCoeff2, remInCh);
+            IVP_LAV2NX8_XP(dvecCoeff3, vaCoeff3, pdvecCoeff3, remInCh);
+            IVP_LAV2NX8_XP(dvecCoeff4, vaCoeff4, pdvecCoeff4, remInCh);
+
+            int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                              (IVP_MOVNX16_FROM2NX8(dvecCoeff1)), 0);
+            int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                              (IVP_MOVNX16_FROM2NX8(dvecCoeff2)), 0);
+            int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                              (IVP_MOVNX16_FROM2NX8(dvecCoeff3)), 0);
+            int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                              (IVP_MOVNX16_FROM2NX8(dvecCoeff4)), 0);
+
+            MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1);
+            MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2);
+            MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3);
+            MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4);
+          } /* end of corner case handling*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                        pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                        pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable store count */
+          int32_t varLen = outW - x;
+
+          /* store output to 1st output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData; vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* store output to 2nd output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* store output to 3rd output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* store output to 4th output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/
+      }   /* end of for (y = 0; y < outH; y++)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else
+  {
+    if ((numInCh <= 2 * XCHAL_IVPN_SIMD_WIDTH))
+    {
+      for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */
+      {
+        for (y = 0; y < outH; y++) /* Loop across Output height */
+        {
+          /* initialize output data pointer */
+          int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+          /* initialize input data pointer */
+          MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+          /* initialize coeff and Bias data pointer */
+          int8_t *pCoeff = &pCoeffData[0];
+          int32_t *pBias = &pBiasData[0];
+
+          for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */
+          {
+            /* In order to handle odd depths*/
+            int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+            int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+            int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+            /* Load the bias values corresponding to four output channels */
+            xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+            xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+            xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+            xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+
+            xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+            dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+            IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+            dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+            IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+            dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+            IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+            dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+            IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+
+            /* variables for coeff loads */
+            xb_vecN_2x32v hvecCoeffData1, hvecCoeffData2, hvecCoeffData3, hvecCoeffData4;
+
+            /* read coeff vectors , for 4 consecutive output depths */
+            /* coeff vector for 1st output channel */
+            phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+            valign vaCoeffData; vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1);
+            IVP_LAVN_2X32_XP(hvecCoeffData1, vaCoeffData, phvecCoeff1, coeffPitch3);
+
+            /* coeff vector for 2nd output channel */
+            vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1);
+            IVP_LAVN_2X32_XP(hvecCoeffData2, vaCoeffData, phvecCoeff1, coeffPitch3 * enable2ndCh);
+
+            /* coeff vector for 3rd output channel */
+            vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1);
+            IVP_LAVN_2X32_XP(hvecCoeffData3, vaCoeffData, phvecCoeff1, coeffPitch3 * enable3rdCh);
+
+            /* coeff vector for 4th output channel */
+            vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1);
+            IVP_LAVN_2X32_XP(hvecCoeffData4, vaCoeffData, phvecCoeff1, coeffPitch3 * enable4thCh);
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+            pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+
+            for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */
+            {
+              /* input vectors are read from 4 input depths at a time
+               * Scalar 32 bit coeff are extracted from the coeff vectors */
+
+              MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+
+              /* Read vector input data from 1st depth */
+              valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2);
+
+              /* Read vector input data from 2nd depth */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2);
+
+              /* Read vector input data from 3rd depth */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+              MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2);
+
+              /* Read vector input data from 4th depth */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+              MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2);
+
+              /* extract scalar coeff from coeff vectors */
+              int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */
+              int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */
+              int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */
+              int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */
+
+              MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1);
+              MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2);
+              MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3);
+              MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4);
+            } /* end of for (inCh = 0; inCh < numInCh - 3; inCh += 4)*/
+              /* Corner case handling if number of inCh is not a multiple of 4 */
+            if (inCh < numInCh)
+            {
+              MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+
+              /* Read vector input data from 1st depth */
+              valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1));
+
+              /* Read vector input data from 2nd depth */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 2));
+
+              /* Read vector input data from 3rd depth */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn1, inDataPitch2);
+
+
+              /* extract scalar coeff from coeff vectors */
+              int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */
+              int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */
+              int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */
+              int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */
+
+              MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1);
+              MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2);
+              MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3);
+              MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4);
+            }
+
+            /* Pack, Output Scale, Output Shift and clamping */
+            xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+            xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                          pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                          pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                          pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                          pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim, typeFlag);
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim, typeFlag);
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim, typeFlag);
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+            /* variable store count */
+            int32_t varLen = outW - x;
+
+            /* store output to 1st output depth */
+            pdvecOut = (xb_vec2Nx8 *) (pOutput);
+            valign vaOutData; vaOutData = IVP_ZALIGN();
+            IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+            IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                           2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+            IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+            /* store output to 2nd output depth */
+            pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+            IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+            IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                           2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+            IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+            /* store output to 3rd output depth */
+            pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+            IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+            IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                           2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh);
+            IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+            /* store output to 4th output depth */
+            pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+            IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+            IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                           2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh);
+            IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+            pOutput += 4 * outDataPitch2 * bytesPerPixel;
+            pCoeff  += 4 * coeffPitch3;
+          } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/
+        }   /* end of for (y = 0; y < outH; y++)*/
+      }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+    }       /* end of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/
+    else
+    {
+#ifdef __XCC__
+      XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+      /* Generate the sequence 0, 1, 2, 3, 0 + coeffPitch3, 1 + coeffPitch3,
+       * 2 + coeffPitch3, 3 + coeffPitch3, 0 + 2 * coeffPitch3, 1 + 2 * coeffPitch3,
+       * 2 + 2 * coeffPitch3, 3 + 2 * coeffPitch3, ...
+       *
+       * For example, if coeffPitch3 is 32:
+       * 0,1,2,3,32,33,34,35,64,65,66,67,96,97,98,99,...
+       *
+       * This sequence is used to gather coeffs from 4 different output channels, 4 from
+       * each channel corresponding to 4 i/p channels, as the innermost loop (inCh) is
+       * unrolled by 4 to make use of the quad multiplier.
+       */
+      xb_vecNx16U vecIdx1 = IVP_SEQNX16();
+      vecIdx1 = IVP_PACKVRNRNX48(IVP_MULNX16(vecIdx1, coeffPitch3), 0);
+      vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 1), vecIdx1, IVP_SELI_16B_INTERLEAVE_1_LO);
+      vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 2), vecIdx1, IVP_SELI_32B_INTERLEAVE_1_LO);
+      xb_gsr gs0;
+
+      for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+      {
+        /* variable store count */
+        int32_t varLen = outW - x;
+        xb_vecNx16U vecIdx2;
+        for (y = 0; y < outH; y++)        /* Loop across Output height */
+        {
+          /* initialize output data pointer */
+          int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+          /* initialize input data pointer */
+          MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+          /* initialize  Bias data pointer */
+
+          int32_t *pBias = &pBiasData[0];
+          int8_t *pCoeff = &pCoeffData[0];
+
+          for (outCh = 0; outCh < numOutCh; outCh += 4)  /* Loop across Output depth */
+          {
+            /* In order to handle odd depths*/
+            int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+            int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+            int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+            /* load and replicate bias data for each output channel */
+            xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+            xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+            xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+            xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+            xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+            dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+            IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+            dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+            IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+            dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+            IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+            dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+            IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+            /* boolean mask to gather coeffs, if all the four o/p channels
+             * are present 16 coeff are loaded.
+             */
+            vboolN mask = IVP_LTNX16(IVP_SEQNX16(), (xb_vecNx16) XT_MIN(((numOutCh - outCh) * 4), 16));
+            /* Assign valid address for predicated false lines */
+            vecIdx2 = IVP_MOVNX16UT(vecIdx1, 0, mask);
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+            pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+
+            for (inCh = 0; inCh < numInCh - 3; inCh += 4)        /* Loop across input depth */
+            {
+              /* input vectors are read from 4 input depths at a time
+               * Scalar 32 bit coeff are extracted from the coeff vectors */
+
+              MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+
+              /* Read vector input data from 1st depth */
+              valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2);
+
+              /* Read vector input data from 2nd depth */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2);
+
+              /* Read vector input data from 3rd depth */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+              MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2);
+
+              /* Read vector input data from 4th depth */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+              MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2);
+
+              /* gather the coeffs */
+              gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2);
+              xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0);
+
+              /* extract scalar coeff from coeff vectors */
+              int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                                IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */
+              int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                                IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */
+              int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                                IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */
+              int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                                IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */
+
+              MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1);
+              MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2);
+              MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3);
+              MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4);
+            }       /* end of for (inCh = 0; inCh < numInCh - 3; inCh += 4)*/
+            if (inCh < numInCh)
+            {
+              MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+
+              pdvecIn2 = (MORPH_IDT_2Nx8 *) (((int8_t *) pdvecIn1) + 2 * inDataPitch2 * XT_SALT(inCh, numInCh - 2));
+              /* Read vector input data from 1st depth */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1));
+
+              /* Read vector input data from 2nd depth */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              IVP_LAV2NX8_XP(dvecData2, vaInData, pdvecIn1, varLen * XT_SALT(inCh, numInCh - 1));
+
+              /* Read vector input data from 3rd depth */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+              IVP_LAV2NX8_XP(dvecData3, vaInData, pdvecIn2, varLen * XT_SALT(inCh, numInCh - 2));
+
+              /* Boolean mask for gather to handle cases where inCh<4 */
+              vboolN mask1 = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), (numInCh - inCh));
+              /* Assign valid address for predicated false lines */
+              vecIdx2 = IVP_MOVNX16UT(vecIdx2, 0, mask1);
+              /* Gather coeffs */
+              gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2);
+              xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0);
+
+              /* extract scalar coeff from coeff vectors */
+              int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                                IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */
+              int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                                IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */
+              int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                                IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */
+              int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                                IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */
+
+              MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1);
+              MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2);
+              MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3);
+              MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4);
+            } /* end of if (inCh < numInCh)*/
+
+            /* Pack, Output Scale, Output Shift and clamping */
+            xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+            xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                          pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                          pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                          pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                          pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim, typeFlag);
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim, typeFlag);
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim, typeFlag);
+            PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                          outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+            /* store output to 1st output depth */
+            pdvecOut = (xb_vec2Nx8 *) (pOutput);
+            valign vaOutData; vaOutData = IVP_ZALIGN();
+            IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+            IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                           2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+            IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+            /* store output to 2nd output depth */
+            pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+            IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+            IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                           2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+            IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+            /* store output to 3rd output depth */
+            pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+            IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+            IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                           2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh);
+            IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+            /* store output to 4th output depth */
+            pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+            IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+            IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                           2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh);
+            IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+            pOutput += 4 * outDataPitch2 * bytesPerPixel;
+            pCoeff  += 4 * coeffPitch3;
+          } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/
+        }   /* end of for (y = 0; y < outH; y++)*/
+      }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+    }       /* end of else part of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolved(VQ)3D_S_1x1j2d1I8S8IX_MOW_WHD
+*  **************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 1x1 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 1x1 3D MOW_WHD dilated convolution function */
+/*               and 1x1 3D VQ MOW_WHD dilated convolution function for U8    */
+/*               and S8 input data with input stride equal to 2               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XAI Error Code                                               */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 1x1xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_1x1j2d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_1x1j2d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_1x1j2d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_1x1j2d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_1x1j2d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 1);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_STRIDE(param, 2);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    if (numInCh > 64)
+    {
+      XAI_CHECK_TILE4D_FITS_IN_SINGLE_DRAM(coeffTile);
+      if (numOutCh > 1)
+      {
+        /* Max value of Gather Offset is (min(numOutCh-1,3)*coeffPitch3 + min(numInCh-1, 3)) */
+        XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM3_PITCH(coeffTile) <                                                       \
+                        ((USHRT_MAX - XT_MIN(numInCh - 1, 3)) / XT_MIN(numOutCh - 1, 3)), XAI_ERR_BADARG,            \
+                        "\ndim3Pitch value of coeffTile = %d, should be less than Gather Offset(16-bit limit) - %d", \
+                        XAI_TILE4D_GET_DIM3_PITCH(coeffTile), ((USHRT_MAX - XT_MIN(numInCh - 1, 3)) / XT_MIN(numOutCh - 1, 3)));
+      }
+    }
+  }
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Pitches of Coefficient Data (WHDN) dim3 */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH;
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* variable declarations */
+  int32_t inCh, outCh, x, y;
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+  xb_vec2Nx8 * restrict pdvecOut;
+  xb_vecN_2x32v * restrict phvecCoeff;
+
+
+  /* The overall design approach is split into 2 sections, one handles
+   * optimal tile sizes for giving best performance, other handles rest
+   * of the tile sizes */
+
+  /* The if-branch handles the optimal input tile size for best performance:
+   * when the input tile depth is less than or equal to 64 this design
+   * approach is used, otherwise control jumps to the else part. Adding one
+   * more loop to support more than 64 input channels causes significant
+   * overhead that degrades performance */
+  if ((numInCh <= 2 * XCHAL_IVPN_SIMD_WIDTH))
+  {
+    /* Loop structure starts with the loop across output width */
+    for (x = 0; x < outW; x += vectorizationWidth) /* loop across output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y++) /* Loop across Output height */
+      {
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x];
+
+        /* initialize coeff and Bias data pointer */
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */
+        {
+          /* In order to handle odd depths*/
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+          int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+          int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+          /* coeff and input data vector declaration */
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+          MORPH_IDT_2Nx8 dvecDataL, dvecDataU;
+          xb_vecN_2x32v hvecCoeffData1, hvecCoeffData2, hvecCoeffData3, hvecCoeffData4;
+
+          /* read coeff vectors , for 4 consecutive output depths */
+          /* coeff vector for 1st output channel */
+          phvecCoeff = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData; vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff);
+          IVP_LAVN_2X32_XP(hvecCoeffData1, vaCoeffData, phvecCoeff, coeffPitch3);
+
+          /* coeff vector for 2nd output channel */
+          vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff);
+          IVP_LAVN_2X32_XP(hvecCoeffData2, vaCoeffData, phvecCoeff, coeffPitch3 * enable2ndCh);
+
+          /* coeff vector for 3rd output channel */
+          vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff);
+          IVP_LAVN_2X32_XP(hvecCoeffData3, vaCoeffData, phvecCoeff, coeffPitch3 * enable3rdCh);
+
+          /* coeff vector for 4th output channel */
+          vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff);
+          IVP_LAVN_2X32_XP(hvecCoeffData4, vaCoeffData, phvecCoeff, coeffPitch3 * enable4thCh);
+
+          /* load and replicate bias data for each output channel */
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+          xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+          xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+          /* Move data from N/2 way 32 bit hvecBias registers to
+           * 2N way 24 bit accumulators */
+          xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+          dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+          dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+          dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+          IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+          dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+          IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* loop across input channels */
+          {
+            /* load data from 1st input channel */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag);
+            MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \
+                               inDataPitch2 - vectorizationWidth * flag);
+            dvecData1 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+            /* load data from 2nd input channel */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag);
+            MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \
+                               3 * inDataPitch2 - vectorizationWidth * flag);
+            dvecData2 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+
+            /* load data from 3rd input channel */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn2, vectorizationWidth * flag);
+            MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn2, \
+                               inDataPitch2 - vectorizationWidth * flag);
+            dvecData3 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+
+            /* load data from 4th input channel */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn2, vectorizationWidth * flag);
+            MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn2, \
+                               3 * inDataPitch2 - vectorizationWidth * flag);
+            dvecData4 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+
+            /* extract scalar coeff from coeff vectors */
+            int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */
+            int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */
+            int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */
+            int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */
+
+            MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1);
+            MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2);
+            MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3);
+            MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4);
+          } /* for (inCh = 0; inCh < numInCh; inCh += 4)*/
+
+          if (inCh < numInCh)
+          {
+            /* load data from 1st input channel */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag);
+            MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \
+                               (inDataPitch2 - vectorizationWidth * flag) * XT_SALT(inCh, numInCh - 1));
+            dvecData1 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+            /* load data from 2nd input channel */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag);
+            MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \
+                               (inDataPitch2 - vectorizationWidth * flag) * XT_SALT(inCh, numInCh - 2));
+            dvecData2 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+
+            /* load data from 3rd input channel */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag);
+            MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \
+                               inDataPitch2 - vectorizationWidth * flag);
+            dvecData3 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+
+            /* extract scalar coeff from coeff vectors */
+            int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */
+            int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */
+            int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */
+            int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */
+
+            MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1);
+            MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2);
+            MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3);
+            MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4);
+          }
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                        pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                        pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable store count */
+          int32_t varLen = outW - x;
+
+          /* store output to 1st output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData; vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* store output to 2nd output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* store output to 3rd output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* store output to 4th output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 4 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 4 * coeffPitch3;
+        }   /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/
+      }     /* end of for (y = 0; y < outH ; y++)*/
+    }       /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }         /* end of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/
+  else
+  {
+#ifdef __XCC__
+    XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+    /* Generate the sequence 0, 1, 2, 3, 0 + coeffPitch3, 1 + coeffPitch3,
+     * 2 + coeffPitch3, 3 + coeffPitch3, 0 + 2 * coeffPitch3, 1 + 2 * coeffPitch3,
+     * 2 + 2 * coeffPitch3, 3 + 2 * coeffPitch3, ...
+     *
+     * For example, if coeffPitch3 is 32:
+     * 0,1,2,3,32,33,34,35,64,65,66,67,96,97,98,99,..
+     *
+     * This sequence is used to gather coefficients from 4 different output channels,
+     * 4 from each channel corresponding to 4 input channels, as the innermost loop (inCh)
+     * is unrolled by 4 to make use of the quad multiplier.
+     */
+    xb_vecNx16U vecIdx1 = IVP_SEQNX16();
+    vecIdx1 = IVP_PACKVRNRNX48(IVP_MULNX16(vecIdx1, coeffPitch3), 0);
+    vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 1), vecIdx1, IVP_SELI_16B_INTERLEAVE_1_LO);
+    vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 2), vecIdx1, IVP_SELI_32B_INTERLEAVE_1_LO);
+    xb_gsr gs0;
+
+    for (x = 0; x < outW; x += vectorizationWidth)     /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+      xb_vecNx16U vecIdx2;
+      /* variable store count */
+      int32_t varLen = outW - x;
+
+      for (y = 0; y < outH; y++)          /* Loop across Output height */
+      {
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x];
+
+        /* initialize  Bias data pointer */
+
+        int32_t *pBias = &pBiasData[0];
+        int8_t *pCoeff = &pCoeffData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 4)    /* Loop across Output depth */
+        {
+          /* In order to handle odd depths*/
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+          int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+          int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+          /* load and replicate bias data for each output channel */
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+          xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+          xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+          xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+          dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+          dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+          dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+          IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+          dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+          IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+          /* boolean mask to gather coeffs, if all the four o/p channels
+           * are present 16 coeff are loaded.
+           */
+          vboolN mask = IVP_LTNX16(IVP_SEQNX16(), (xb_vecNx16) XT_MIN(((numOutCh - outCh) * 4), 16));
+          /* Assign valid address for predicated false lines */
+          vecIdx2  = IVP_MOVNX16UT(vecIdx1, 0, mask);
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4)          /* Loop across input depth */
+          {
+            /* Input vectors are read from 4 input depths at a time.
+             * Scalar 32-bit coeffs are extracted from the coeff vectors. */
+
+            MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+            MORPH_IDT_2Nx8 dvecDataL, dvecDataU;
+
+            /* load data from 1st input channel */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag);
+            MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \
+                               inDataPitch2 - vectorizationWidth * flag);
+            dvecData1 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+
+            /* load data from 2nd input channel */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag);
+            MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \
+                               3 * inDataPitch2 - vectorizationWidth * flag);
+            dvecData2 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+
+            /* load data from 3rd input channel */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn2, vectorizationWidth * flag);
+            MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn2, \
+                               inDataPitch2 - vectorizationWidth * flag);
+            dvecData3 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+
+            /* load data from 4th input channel */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn2, vectorizationWidth * flag);
+            MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn2, \
+                               3 * inDataPitch2 - vectorizationWidth * flag);
+            dvecData4 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+            /* gather the coeffs */
+            gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2);
+            xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0);
+
+            /* extract scalar coeff from coeff vectors */
+            int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                              IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */
+            int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                              IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */
+            int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                              IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */
+            int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                              IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */
+
+            MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1);
+            MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2);
+            MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3);
+            MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4);
+          }        /* end of for (inCh = 0; inCh < numInCh; inCh += 4)*/
+          if (inCh < numInCh)
+          {
+            MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+            MORPH_IDT_2Nx8 dvecDataL, dvecDataU;
+
+            /* load data from 1st input channel */
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8_VARIABLE(dvecDataL, vaInData, pdvecIn1, (inW - stride * x));
+            MORPH_OP_LOAD_2Nx8_VARIABLE(dvecDataU, vaInData, pdvecIn1, \
+                                        (inW - stride * x - vectorizationWidth));
+            dvecData1 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+            /* load data from 2nd input channel */
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + ((inCh + 1) * inDataPitch2 * XT_SALT(inCh, numInCh - 1)));
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8_VARIABLE(dvecDataL, vaInData, pdvecIn1, \
+                                        (inW - stride * x) * XT_SALT(inCh, numInCh - 1));
+            MORPH_OP_LOAD_2Nx8_VARIABLE(dvecDataU, vaInData, pdvecIn1, \
+                                        (inW - stride * x - vectorizationWidth) * XT_SALT(inCh, numInCh - 1));
+            dvecData2 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+
+
+            /* load data from 3rd input channel */
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + ((inCh + 2) * inDataPitch2 * XT_SALT(inCh, numInCh - 2)));
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8_VARIABLE(dvecDataL, vaInData, pdvecIn1, \
+                                        (inW - stride * x) * XT_SALT(inCh, numInCh - 2));
+            MORPH_OP_LOAD_2Nx8_VARIABLE(dvecDataU, vaInData, pdvecIn1, \
+                                        (inW - stride * x - vectorizationWidth) * XT_SALT(inCh, numInCh - 2));
+            dvecData3 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+
+            /* Boolean mask for gather to handle cases where inCh<4 */
+            vboolN mask1 = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), (numInCh - inCh));
+            /* Assign valid address for predicated false lines */
+            vecIdx2 = IVP_MOVNX16UT(vecIdx2, 0, mask1);
+            /* Gather coeffs */
+            gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2);
+            xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0);
+
+            /* extract scalar coeff from coeff vectors */
+            int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                              IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */
+            int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                              IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */
+            int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                              IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */
+            int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                              IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */
+
+            MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1);
+            MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2);
+            MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3);
+            MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4);
+          } /* end of if (inCh < numInCh)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                        pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                        pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* store output to 1st output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData; vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* store output to 2nd output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* store output to 3rd output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* store output to 4th output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 4 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 4 * coeffPitch3;
+        }   /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/
+      }     /* end of for (y = 0; y < outH; y++)*/
+    }       /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }         /* end of else part of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*  xaiConvolved(VQ)3D_S_1x1j4d1I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 1x1 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 1x1 3D MOW_WHD dilated convolution function */
+/*               and 1x1 3D VQ MOW_WHD dilated convolution function for U8    */
+/*               and S8 input data with input stride equal to 4               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 1x1xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_1x1j4d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_1x1j4d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_1x1j4d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_1x1j4d1_U8S8IX_MOW_WHD *******************/
+//#if 0
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_1x1j4d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks: tile types and data orders, 1x1 kernel, stride 4, dilation 1, shift ranges */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 1);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_STRIDE(param, 4);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inTile) == XAI_TILE4D_GET_DIM3(coeffTile),                                               \
+                    XAI_ERR_BADARG, "\ninTile depth = %d, coeffTile depth = %d\ninTile depth should be same as coeffTile depth", \
+                    XAI_TILE3D_GET_DIM3(inTile), XAI_TILE4D_GET_DIM3(coeffTile));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTile) == XAI_TILE4D_GET_DIM4(coeffTile),                                                    \
+                    XAI_ERR_BADARG, "\noutTile depth = %d, number of kernels = %d\noutTile depth should be same as number of kernels", \
+                    XAI_TILE3D_GET_DIM3(outTile), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) /* zero scale: fill outTile with a constant and return early */
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param); /* expected to be 4 (see XAI_CHECK_STRIDE above) */
+
+  /* Pitches of Coefficient Data (WHDN) dim3 */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 for S16 output, 0 otherwise */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); /* used below to scale output offsets to bytes */
+
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecNx8* restrict pvecCoeff1, * restrict pvecCoeff2, \
+  * restrict pvecCoeff3, * restrict pvecCoeff4;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH;
+  int32_t varLen;
+
+  /* declare gather registers and compute the sequence
+   * 0,4,8,12, ....... 120, 124 required as offset for
+   * gathering data (IVP_SEQNX16() << 2, i.e. one element every 4)
+   */
+  xb_gsr gs0, gs1;
+  xb_vecNx16U vecIdx1 = IVP_SEQNX16() << 2;
+
+#ifdef __XCC__
+  XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */
+#endif
+
+  /* loop across output depth is unrolled by 4,
+   * producing lanes from 4 output channels
+   * in one iteration. Since vectorization width
+   * is just half the width of the accumulator,
+   * loop across output height is also unrolled by 2.
+   * Unrolling across output height makes it possible
+   * to utilize all the 64 MACs in the accumulator.
+   *
+   * Data loaded from the 2 input rows is concatenated
+   * in such a manner that the lower half of the output
+   * vector gives the first output row and the upper
+   * half gives the second output row. */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */
+  {
+    /* variable load and store count */
+    varLen = XT_MIN(outW - x, vectorizationWidth);
+
+    for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+    {
+      /* In order to handle odd heights: used as a 0/1 multiplier on 2nd-row offsets and store counts */
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+      xb_vecNx16U vecIdx2;
+      /* Initialize o/p data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4)   /* Loop across Output depth */
+      {
+        /* 0/1 flags for channels outCh+1..outCh+3; handle numOutCh that is not a multiple of 4 */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* Initialize i/p data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y * stride + x * stride];
+
+        /* load and replicate bias data; the pBias update is 0 bytes when the next channel is disabled */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* priming of coeff load is done outside the innermost loop; disabled channels alias pCoeff */
+        pvecCoeff1 = (xb_vecNx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LANX8S_PP(pvecCoeff1);
+
+        pvecCoeff2 = (xb_vecNx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LANX8S_PP(pvecCoeff2);
+
+        pvecCoeff3 = (xb_vecNx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LANX8S_PP(pvecCoeff3);
+
+        pvecCoeff4 = (xb_vecNx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh);
+        valign vaCoeffData4; vaCoeffData4 = IVP_LANX8S_PP(pvecCoeff4);
+
+        /* mask for gathering input data based on varLen */
+        vboolN mask1 = IVP_LTNX16(IVP_SEQNX16(), (xb_vecNx16) varLen);
+        /* Assign a valid address (offset 0) for predicated-false lanes */
+        vecIdx2 = IVP_MOVNX16UT(vecIdx1, 0, mask1);
+
+        /* loop across input channels is unrolled by 2,
+         * enabling us to use paired multipliers
+         */
+
+        /* 32 elements are gathered from the 1st input height
+         * in the gs0 register and then 32 elements are gathered
+         * from the next input height(as loop across output height
+         * is unrolled by 2) in the gs1 register. So lower half of
+         * the dvecData1 holds the data from 1st input height and
+         * upper half holds the data from the 2nd input height
+         *
+         * Similarly dvecData2 holds data from the 2nd input channel,
+         * lower half holds 1st input height and upper half holds 2nd
+         * input height
+         */
+
+        for (inCh = 0; inCh < numInCh - 1; inCh += 2)   /* Loop across input channels */
+        {
+          /* variable declarations for input and coeff vectors */
+          MORPH_IDT_2Nx8 dvecData1, dvecData2;
+
+          /* loads data from 1st input channel */
+          gs0       = MORPH_OP_GATHER(pInput, vecIdx2);
+          gs1       = MORPH_OP_GATHER(pInput + stride * inDataPitch1 * enable2ndRow, vecIdx2);
+          dvecData1 = MORPH_OP_GATHER_2Nx8_LOW(gs0);
+          MORPH_OP_GATHER_2Nx8_HIGH(dvecData1, gs1);
+          pInput += inDataPitch2;
+
+          /* loads data from next input channel */
+          gs0       = MORPH_OP_GATHER(pInput, vecIdx2);
+          gs1       = MORPH_OP_GATHER(pInput + stride * inDataPitch1 * enable2ndRow, vecIdx2);
+          dvecData2 = MORPH_OP_GATHER_2Nx8_LOW(gs0);
+          MORPH_OP_GATHER_2Nx8_HIGH(dvecData2, gs1);
+          pInput += inDataPitch2;
+
+          /* load 2 coeff for all the 4 output channels, 8 to 16 bit
+           * conversion is taken care of by the load instruction  */
+          xb_vecNx16 vecCoeffData1, vecCoeffData2, vecCoeffData3, vecCoeffData4;
+          IVP_LAVNX8S_XP(vecCoeffData1, vaCoeffData1, pvecCoeff1, 2);
+          IVP_LAVNX8S_XP(vecCoeffData2, vaCoeffData2, pvecCoeff2, 2);
+          IVP_LAVNX8S_XP(vecCoeffData3, vaCoeffData3, pvecCoeff3, 2);
+          IVP_LAVNX8S_XP(vecCoeffData4, vaCoeffData4, pvecCoeff4, 2);
+
+          /* multiply data from 1st input channel with 1st coeff
+           * and data from 2nd input channel with 2nd coeff and
+           * accumulate
+           */
+          MORPH_OP_MULPA(dacc1, dvecData2, dvecData1, \
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData1), 0));
+          MORPH_OP_MULPA(dacc2, dvecData2, dvecData1, \
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData2), 0));
+          MORPH_OP_MULPA(dacc3, dvecData2, dvecData1, \
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData3), 0));
+          MORPH_OP_MULPA(dacc4, dvecData2, dvecData1, \
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData4), 0));
+        }   /* end of for (inCh = 0; inCh < numInCh - 1; inCh += 2)*/
+
+        /* handles left out odd input channel; the paired data operand is passed as 0 */
+        if (inCh < numInCh)
+        {
+          MORPH_IDT_2Nx8 dvecData1;
+
+          /* loads data from the left out input channel */
+          gs0       = MORPH_OP_GATHER(pInput, vecIdx2);
+          gs1       = MORPH_OP_GATHER(pInput + stride * inDataPitch1 * enable2ndRow, vecIdx2);
+          dvecData1 = MORPH_OP_GATHER_2Nx8_LOW(gs0);
+          MORPH_OP_GATHER_2Nx8_HIGH(dvecData1, gs1);
+
+          xb_vecNx16 vecCoeffData1, vecCoeffData2, vecCoeffData3, vecCoeffData4;
+          IVP_LAVNX8S_XP(vecCoeffData1, vaCoeffData1, pvecCoeff1, 1);
+          IVP_LAVNX8S_XP(vecCoeffData2, vaCoeffData2, pvecCoeff2, 1);
+          IVP_LAVNX8S_XP(vecCoeffData3, vaCoeffData3, pvecCoeff3, 1);
+          IVP_LAVNX8S_XP(vecCoeffData4, vaCoeffData4, pvecCoeff4, 1);
+
+          MORPH_OP_MULPA(dacc1, 0, dvecData1, \
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData1), 0));
+          MORPH_OP_MULPA(dacc2, 0, dvecData1, \
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData2), 0));
+          MORPH_OP_MULPA(dacc3, 0, dvecData1, \
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData3), 0));
+          MORPH_OP_MULPA(dacc4, 0, dvecData1, \
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData4), 0));
+        } /* end of if(inCh < numInCh)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* store the first half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4 (byte count is 0 for disabled channels)
+         */
+
+        /* Storing the first row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* extract the upper half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4 and store
+         * in the next output height (typeFlag == 0 path; when typeFlag == 1
+         * the 2nd row is stored from dvecOut1H..dvecOut4H instead) */
+        dvecOut1L = IVP_SEL2NX8I(0, dvecOut1L, IVP_SELI_8B_EXTRACT_HI_HALVES);
+        dvecOut2L = IVP_SEL2NX8I(0, dvecOut2L, IVP_SELI_8B_EXTRACT_HI_HALVES);
+        dvecOut3L = IVP_SEL2NX8I(0, dvecOut3L, IVP_SELI_8B_EXTRACT_HI_HALVES);
+        dvecOut4L = IVP_SEL2NX8I(0, dvecOut4L, IVP_SELI_8B_EXTRACT_HI_HALVES);
+
+        /* Storing the 2nd row outputs, 1st channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, (-typeFlag + 1) * varLen * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, (-typeFlag + 1) * \
+                       varLen * enable2ndCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable2ndCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, (-typeFlag + 1) * \
+                       varLen * enable3rdCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable3rdCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, (-typeFlag + 1) * \
+                       varLen * enable4thCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable4thCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel; /* advance to the next group of 4 output channels */
+        pCoeff  += 4 * coeffPitch3;
+      }   /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }     /* end of for (y = 0; y < outH; y += 2)*/
+  }       /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+* MOW fold 16 Stride 1                                                                    *
+* If inDataPitch1 is less than or equal to                                                *
+* 16 this function is called.                                                             *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_2x2j1d1), S8IX_MOW_WHD_FOLD16) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag  = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  int32_t leftEdge, topEdge;
+
+  leftEdge = leftEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+  topEdge  = topEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, y;
+
+  /* loop across output channels and output height are unrolled twice
+   * to produce four output vectors in 1 iteration
+   */
+  for (y = 0; y < outH - 3; y += 4)   /* Loop across output height */
+  {
+    /* initialize output data pointer */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize coeff and Bias data pointer */
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+    {
+      /* In order to handle odd output depths */
+      int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+      /* Load the bias values corresponding to two output channels */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+      xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias */
+      xb_vec2Nx24 dacc1, dacc2;
+
+      dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+      dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+      IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+      /* priming of coeff load is done outside the innermost loop*/
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+      valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+      pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1);
+
+      for (inCh = 0; inCh < numInCh - 1; inCh += 2)    /* Loop across input channels */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2;
+        MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp;
+
+        /* Process first input channel */
+        /* load data from 4 input rows [Row0 | Row1 | Row2 | Row3] */
+        valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch2);
+        dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* load data from next 4 input rows [Row1 | Row2 | Row3 | Row4] */
+        valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch2);
+        dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* load all the 2x2 coefficients for 2 output depths*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 8);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 8);
+
+        /* Get co-efficients for first channel */
+        int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0);
+        int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0);
+
+        /* Compute Row 1 of 1st output channel */
+        MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1);
+
+        /* Compute Row 1 of 2nd output channel */
+        MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2);
+
+        /* Process second input channel */
+        /* load data from 4 input rows [Row0 | Row1 | Row2 | Row3] */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch2);
+        dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* load data from 4 input rows [Row1 | Row2 | Row3 | Row4] */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch2);
+        dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Get co-efficients for second channel */
+        qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1);
+        qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1);
+
+        /* Compute Row 1 of 1st output channel */
+        MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1);
+
+        /* Compute Row 1 of 2nd output channel */
+        MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2);
+      }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+      if (inCh < numInCh)
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2;
+        MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp;
+
+        /* load data from 2 input rows [Row0 | Row1 | Row2 | Row3] */
+        valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch2);
+        dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* load data from 2 input rows [Row1 | Row2 | Row3 | Row4] */
+        valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch2);
+        dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* load all the 2x2 coefficients for 2 output depths */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 4);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 4);
+
+        int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0);
+        int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0);
+
+        /* Compute Row 1 of 1st output channel */
+        MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1);
+
+        /* Compute Row 1 of 2nd output channel */
+        MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2);
+      }
+
+      /* Pack, Output Scale, Output Shift and clamping */
+      xb_vec2Nx8 dvecOut1L, dvecOut2L;
+      xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+      /* Storing the first output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, third row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, fourth row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)),
+                     vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+      IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            outDataPitch1) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, third row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            2 * outDataPitch1) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, fourth row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            3 * outDataPitch1) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      pOutput += 2 * outDataPitch2 * bytesPerPixel;
+      pCoeff  += 2 * coeffPitch3;
+    }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+  }    /* end of for (y = 0; y < outH-3; y += 4)*/
+
+  /* handle left out output rows */
+  if (y < outH)
+  {
+    int32_t enable2ndRow = XT_SALT(y, outH - 1);
+    int32_t enable3rdRow = XT_SALT(y, outH - 2);
+
+    /* initialize output data pointer */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize coeff and Bias data pointer */
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+    {
+      /* In order to handle odd output depths and heights */
+      int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+      /* Load the bias values corresponding to two output channels */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+      xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias */
+      xb_vec2Nx24 dacc1, dacc2;
+
+      dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+      dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+      IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+      /* priming of coeff load is done outside the innermost loop*/
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+      valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      for (inCh = 0; inCh < numInCh - 1; inCh += 2)    /* Loop across input channels */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2;
+        MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp;
+
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1);
+
+        /* load data from first 4 input rows [Row0 | Row1 | Row2 | Row3] */
+        valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData1, vaInData1, pdvecIn1, (2 + enable2ndRow + enable3rdRow) * inDataPitch1);
+        dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* load data from next 4 input rows [Row1 | Row2 | Row3 | Row4]*/
+        valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData2, vaInData2, pdvecIn2, (1 + enable2ndRow + enable3rdRow) * inDataPitch1);
+        dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+        pInput         += inDataPitch2;
+
+        /* load all the 2x2 coefficients for 2 output depths*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 8);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 8 * enable2ndCh);
+
+        int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0);
+        int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0);
+
+        /* Compute Row 1 of 1st output channel */
+        MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1);
+
+        /* Compute Row 1 of 2nd output channel */
+        MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2);
+
+        /* Process next input channel */
+        /* load data from 2 input rows [Row0 | Row1 | Row2 | Row3] */
+        pdvecIn1  = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2  = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1);
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData1, vaInData1, pdvecIn1, (2 + enable2ndRow + enable3rdRow) * inDataPitch1);
+        dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* load data from 2 input rows [Row4 | Row5 | Row6 | Row7] */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData2, vaInData2, pdvecIn2, (1 + enable2ndRow + enable3rdRow) * inDataPitch1);
+        dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+        pInput         += inDataPitch2;
+
+        qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1);
+        qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1);
+
+        /* Compute Row 1 of 1st output channel */
+        MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1);
+
+        /* Compute Row 1 of 2nd output channel */
+        MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2);
+      }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+      if (inCh < numInCh)
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2;
+        MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp;
+
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1);
+
+        /* load data from 2 input rows [Row0 | Row1 | Row2 | Row3] */
+        valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData1, vaInData1, pdvecIn1, (2 + enable2ndRow + enable3rdRow) * inDataPitch1);
+        dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* load data from 2 input rows [Row4 | Row5 | Row6 | Row7] */
+        valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData2, vaInData2, pdvecIn2, (1 + enable2ndRow + enable3rdRow) * inDataPitch1);
+        dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+        pInput         += inDataPitch2;
+
+        /* load all the 2x2 coefficients for 2 output depths*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 4);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 4);
+
+        int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0);
+        int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0);
+
+        /* Compute Row 1 of 1st output channel */
+        MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1);
+
+        /* Compute Row 1 of 2nd output channel */
+        MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2);
+      }
+
+      /* Pack, Output Scale, Output Shift and clamping */
+      xb_vec2Nx8 dvecOut1L, dvecOut2L;
+      xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+      /* Storing the first output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, 3rd row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * enable3rdRow * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable3rdRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+      IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            outDataPitch1 * enable2ndRow) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable2ndRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, 3rd row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            2 * outDataPitch1 * enable3rdRow) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable3rdRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      pOutput += 2 * outDataPitch2 * bytesPerPixel;
+      pCoeff  += 2 * coeffPitch3;
+    } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+  }   /* end of if(y < outH)*/
+}
+
+/******************************************************************************************
+* MOW fold 32 Stride 1                                                                    *
+* If inDataPitch1 is less than or equal to                                                *
+* 32 this function is called.                                                             *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_2x2j1d1), S8IX_MOW_WHD_FOLD32) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* 2x2 MOW_WHD convolution kernel for tiles whose inDataPitch1 is at most 32: a vector load is
+   * treated as [Row0 | Row1], so each accumulator carries two output rows of one output channel. */
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitch of Coefficient Data (WHDN) in dim3 */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag  = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  int32_t leftEdge, topEdge;
+
+  leftEdge = leftEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+  topEdge  = topEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, y;
+
+  /* the loops across output channels and output height are each unrolled twice,
+   * so one iteration produces two output rows for each of two output channels
+   */
+  for (y = 0; y < outH; y += 2)   /* Loop across output height */
+  {
+    /* in order to handle odd output height: used below as a multiplier on the 2nd row store */
+    int32_t enable2Row = XT_SALT(y, outH - 1);
+
+    /* initialize output data pointer */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize coeff and Bias data pointer */
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+    {
+      /* In order to handle odd output depths */
+      int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+      /* Load the bias values of two output channels (pBias advances by 4 * enable2ndCh in between) */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+      xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias */
+      xb_vec2Nx24 dacc1, dacc2;
+
+      dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+      dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+      IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+      /* priming of coeff load is done outside the innermost loop*/
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+      valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      /* point at input rows y and y + 1 (one row pitch apart) of the first input channel */
+      pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+      pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1);
+
+      for (inCh = 0; inCh < numInCh - 1; inCh += 2)    /* Loop across input channels, two per iteration */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2;
+        MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp;
+
+        /* load data from 2 input rows [Row0 | Row1] */
+        valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch2);
+        dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* load data from 2 input rows [Row1 | Row2] */
+        valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch2);
+        dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* load the 2x2 coefficients of 2 input channels (8 bytes) for each of 2 output depths */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 8);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 8);
+
+        int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0);
+        int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0);
+
+        /* Accumulate both rows of the 1st output channel */
+        MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1);
+
+        /* Accumulate both rows of the 2nd output channel */
+        MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2);
+
+        /* Process the 2nd input channel of the pair */
+        /* load data from 2 input rows [Row0 | Row1] */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch2);
+        dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* load data from 2 input rows [Row1 | Row2] */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch2);
+        dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1);
+        qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1);
+
+        /* Accumulate both rows of the 1st output channel */
+        MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1);
+
+        /* Accumulate both rows of the 2nd output channel */
+        MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2);
+      }    /* end of for (inCh = 0; inCh < numInCh - 1; inCh += 2)*/
+
+      if (inCh < numInCh)    /* odd numInCh: one input channel is left */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2;
+        MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp;
+
+        /* load data from 2 input rows [Row0 | Row1] */
+        valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch2);
+        dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* load data from 2 input rows [Row1 | Row2] */
+        valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch2);
+        dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* only the 4 (2x2) coefficient bytes of the last input channel remain per output depth */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 4);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 4);
+
+        int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0);
+        int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0);
+
+        /* Accumulate both rows of the 1st output channel */
+        MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1);
+
+        /* Accumulate both rows of the 2nd output channel */
+        MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2);
+      }
+
+      /* Pack, Output Scale, Output Shift and clamping */
+      xb_vec2Nx8 dvecOut1L, dvecOut2L;
+      xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+      /* Storing the first output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, second row (zero bytes are stored when enable2Row is 0) */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2Row * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, first row (zero bytes are stored when enable2ndCh is 0) */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+      IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            outDataPitch1 * enable2Row) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                     enable2Row * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      pOutput += 2 * outDataPitch2 * bytesPerPixel;
+      pCoeff  += 2 * coeffPitch3;
+    }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+  }    /* end of for (y = 0; y < outH; y += 2)*/
+}
+
+
+/*****************************************************************************
+*  xaiConvolved(VQ)3D_S_2x2j1d1I8S8IX_MOW_WHD
+*  **************************************************************************/
+/********************************************************************************/
+/* Description : P6 optimized generic implementation for 2x2 3D convolution with*/
+/*               dilation = 1. Based on MORPH pre-processor specifiers, code    */
+/*               implementation is generated during preprocessing stage. This   */
+/*               method can be used to generate 2x2 3D MOW_WHD convolution      */
+/*               function and 2x2 3D VQ MOW_WHD convolution function for U8 bit */
+/*               and S8 bit input data with input stride equal to 1             */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                  */
+/*               Output scale array, CNN convolution params structure           */
+/* Outputs     : XI Error Code                                                  */
+/* InOuts      : Output Tile                                                    */
+/* Assumptions : CoeffData is S8                                                */
+/*               biasArray is signed 32b, value not exceeding signed 24b        */
+/*               OutData is S8 / U8 / S16                                       */
+/*               Kernel Size is 2x2xDxN                                         */
+/*               Input and Output are in WHD format                             */
+/*               Coeff is in WHDN format                                        */
+/********************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_2x2j1d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_2x2j1d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_2x2j1d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_2x2j1d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_2x2j1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 2);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                           \
+                    XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param);
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* check inDataPitch1: if it is less than or equal to 16 call the FOLD16
+   * variant, if it is less than or equal to 32 call the FOLD32 variant, otherwise continue
+   */
+
+  if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 16)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_2x2j1d1), S8IX_MOW_WHD_FOLD16) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 32)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_2x2j1d1), S8IX_MOW_WHD_FOLD32) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag  = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  int32_t leftEdge, topEdge;
+
+  leftEdge = leftEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+  topEdge  = topEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+  MORPH_IDT_2Nx8 *restrict pdvecIn3;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth = ((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1;
+
+  /* loop across output channels and output height are unrolled twice
+   * to produce four output vectors in 1 iteration
+   */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */
+  {
+    for (y = 0; y < outH; y += 2)   /* Loop across output height */
+    {
+      /* in order to handle odd output height */
+      int32_t enable2Row = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        /* Load the bias values corresponding to two output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 daccSum11, daccSum21, daccSum12, daccSum22;
+
+        daccSum11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(daccSum11, hvecBias1, hvecBias1);
+
+        daccSum12 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(daccSum12, hvecBias1, hvecBias1);
+
+        daccSum21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(daccSum21, hvecBias2, hvecBias2);
+
+        daccSum22 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(daccSum22, hvecBias2, hvecBias2);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + (coeffPitch3 * enable2ndCh));
+
+        /* Input vector pointer initialization- 1st input channel */
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1);
+        pdvecIn3 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch1 * enable2Row);
+
+        for (inCh = 0; inCh < numInCh - 1; inCh += 2)    /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+          MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3;
+
+          /* load all the 2x2 coefficients for 1st output channel*/
+          valign vaCoeffData;
+          vaCoeffData = IVP_LA2NX8_PP(pdvecCoeff1);
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData, pdvecCoeff1, 8);
+
+          /* load all the 2x2 coefficients for 2nd output channel*/
+          vaCoeffData = IVP_LA2NX8_PP(pdvecCoeff2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData, pdvecCoeff2, 8);
+
+          /* load data from first input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn1, inDataPitch2);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn2, inDataPitch2);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn3);
+          MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn3, inDataPitch2);
+
+          MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp, dvecInData3temp;
+
+          /* Reorder/ rotate the input required for filter kernel computation */
+          dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecInData3temp = IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0);
+
+          /* Compute Row 1 of 1st channel */
+          MORPH_OP_MULQA(daccSum11, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1);
+          /* Compute Row 2 of 1st channel */
+          MORPH_OP_MULQA(daccSum12, dvecInData3temp, dvecInData3, dvecInData2temp, dvecInData2, qmulScalar1);
+
+          int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0);
+
+          /* Compute Row 1 of 2nd channel */
+          MORPH_OP_MULQA(daccSum21, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2);
+          /* Compute Row 2 of 2nd channel */
+          MORPH_OP_MULQA(daccSum22, dvecInData3temp, dvecInData3, dvecInData2temp, dvecInData2, qmulScalar2);
+
+          /* load data from first input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn1, inDataPitch2);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn2, inDataPitch2);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn3);
+          MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn3, inDataPitch2);
+
+          /* Reorder/ rotate the input required for filter kernel computation */
+          dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecInData3temp = IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1);
+
+          /* Compute Row 1 of 1st channel */
+          MORPH_OP_MULQA(daccSum11, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1);
+          /* Compute Row 2 of 1st channel */
+          MORPH_OP_MULQA(daccSum12, dvecInData3temp, dvecInData3, dvecInData2temp, dvecInData2, qmulScalar1);
+
+          qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1);
+
+          /* Compute Row 1 of 2nd channel */
+          MORPH_OP_MULQA(daccSum21, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2);
+          /* Compute Row 2 of 2nd channel */
+          MORPH_OP_MULQA(daccSum22, dvecInData3temp, dvecInData3, dvecInData2temp, dvecInData2, qmulScalar2);
+        }                   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+        if (inCh < numInCh) /*Control flow to handle final row for odd input channel count*/
+        {
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+          MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3;
+
+          /* load all the 2x2 coefficients for 1st output channel*/
+          valign vaCoeffData = IVP_LA2NX8_PP(pdvecCoeff1);
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData, pdvecCoeff1, 4);
+
+          /* load all the 2x2 coefficients for 2nd output channel*/
+          vaCoeffData = IVP_LA2NX8_PP(pdvecCoeff2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData, pdvecCoeff2, 4);
+
+          /* load data from first input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn1, inDataPitch2);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn2, inDataPitch2);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn3);
+          MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn3, inDataPitch2);
+
+          MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp, dvecInData3temp;
+
+          /* Reorder/ rotate the input required for filter kernel computation */
+          dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecInData3temp = IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0);
+
+          /* Compute Row 1 of 1st channel */
+          MORPH_OP_MULQA(daccSum11, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1);
+          /* Compute Row 2 of 1st channel */
+          MORPH_OP_MULQA(daccSum12, dvecInData3temp, dvecInData3, dvecInData2temp, dvecInData2, qmulScalar1);
+
+          int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0);
+
+          /* Compute Row 1 of 2nd channel */
+          MORPH_OP_MULQA(daccSum21, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2);
+          /* Compute Row 2 of 2nd channel */
+          MORPH_OP_MULQA(daccSum22, dvecInData3temp, dvecInData3, dvecInData2temp, dvecInData2, qmulScalar2);
+        }
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut11L, dvecOut12L, dvecOut21L, dvecOut22L;
+        xb_vec2Nx8 dvecOut11H, dvecOut12H, dvecOut21H, dvecOut22H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, daccSum11, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, daccSum12, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, daccSum21, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, daccSum22, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, daccSum11, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, daccSum12, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, daccSum21, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, daccSum22, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores */
+        int32_t varLen = XT_MIN(vectorizationWidth, outW - x);
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut11L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAV2NX8_XP(dvecOut11H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut12L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2Row);
+        IVP_SAV2NX8_XP(dvecOut12H, vaOutData, pdvecOut, typeFlag * \
+                       enable2Row * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut21L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut21H, vaOutData, pdvecOut, typeFlag * enable2ndCh * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2Row) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut22L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable2Row * varLen);
+        IVP_SAV2NX8_XP(dvecOut22H, vaOutData, pdvecOut, typeFlag * enable2ndCh * \
+                       enable2Row * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 2 * coeffPitch3;
+      }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+    }    /* end of for (y = 0; y < outH; y += 2)*/
+  }      /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolved(VQ)3D_S_3x3j1d1I8S8IX_MOW_WHD
+*  **************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 3x3 3D VQ convolution*/
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 3x3 MOW_WHD 3D dilated convolution function */
+/*               and 3x3 MOW_WHD 3D VQ dilated convolution function for U8    */
+/*               bit and S8 bit input data with input stride equal to 1       */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 3x3xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+/******************************************************************************************
+* MOW fold 16 Stride 1 variant                                                            *
+* If inDataPitch1 is less than or equal to                                                *
+* 16 this function is called.                                                             *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_3x3j1d1), S8IX_MOW_WHD_FOLD16) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+
+  const uint8_t outShiftU  = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, y;
+
+  /* Generating the shuffle pattern for coefficient loads.
+     The idea is to populate zero value where the MUL4T should not affect
+     Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... */
+  xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \
+                                      IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3),
+                                                               IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32,
+                                                   IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))),
+                                    IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+
+  /* Select sequence to re-arrange input data */
+  xb_vec2Nx8 dvecSeq = 0;
+  IVP_ADD2NX8T(dvecSeq, IVP_SEQ2NX8(), inDataPitch1, IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1));
+  IVP_ADD2NX8T(dvecSeq, IVP_SUB2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1), (2 * XCHAL_IVPN_SIMD_WIDTH), \
+               IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1)));
+
+  /* loop across output channels is unrolled twice and
+   * loop across output height is unrolled 4 times
+   */
+  for (y = 0; y < outH - 3; y += 4)   /* Loop across output height */
+  {
+    /* initialize output data pointer */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize input data pointer */
+    MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+    /* initialize coeff and Bias data pointer */
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+    {
+      /* In order to handle odd output depths and heights */
+      int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+      /* Load the bias values corresponding to two output channels */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+      xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias */
+      xb_vec2Nx24 dacc1, dacc2;
+
+      dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+      dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+      IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+      /* priming of coeff load is done outside the innermost loop*/
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+      valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+      /* Input vector pointer initialization */
+      pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+
+      for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2;
+
+        /* load data from first 4 input rows */
+        valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, 4 * inDataPitch1);
+
+        /* load data from next 4 input rows */
+        MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch2 - (4 * inDataPitch1));
+
+        /* load all the 3x3 coefficients for 2 output depths*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 9);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 9);
+
+        /* Rearrange them so that zero is inserted where the MUL4T should not have effect */
+        dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+        dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+
+        /* dvecInData1 contains first 4 input rows and
+         * dvecInData2 contains next 4 input rows.
+         * dvecInData1: row0 | row1 | row2 | row3
+         * dvecInData2: row4 | row5 | row6 | row7
+         *
+         * Input data is re arranged in such a manner that
+         * dvecTemp1 contains: row1 | row2 | row3 | row4
+         * dvecTemp2 contains: row2 | row3 | row4 | row5
+         */
+        xb_vec2Nx8 dvecTemp1, dvecTemp2;
+        dvecTemp1 = IVP_SEL2NX8(dvecInData2, dvecInData1, dvecSeq);
+        dvecTemp2 = IVP_SEL2NX8(IVP_SEL2NX8(0, dvecInData2, dvecSeq), dvecTemp1, dvecSeq);
+
+        /* Multiply input data with coefficients from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, dvecTemp1, dvecTemp1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MUL4TA(dacc1, dvecTemp2, dvecTemp2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+        /* Multiply input data with coefficients from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc2, dvecTemp1, dvecTemp1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MUL4TA(dacc2, dvecTemp2, dvecTemp2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+      }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+      /* Pack, Output Scale, Output Shift and clamping */
+      xb_vec2Nx8 dvecOut1L, dvecOut2L;
+      xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+      /* Storing the first output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, 3rd row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, 4th row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)),
+                     vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+      IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            outDataPitch1) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, 3rd row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            2 * outDataPitch1) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, 4th row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            3 * outDataPitch1) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+
+      pOutput += 2 * outDataPitch2 * bytesPerPixel;
+      pCoeff  += 2 * coeffPitch3;
+    }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+  }    /* end of for (y = 0; y < outH - 3; y += 4)*/
+  /* handle left out output rows */
+  if (y < outH)
+  {
+    int32_t enable2ndRow = XT_SALT(y, outH - 1);
+    int32_t enable3rdRow = XT_SALT(y, outH - 2);
+    /* initialize output data pointer */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize coeff and Bias data pointer */
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+    {
+      /* In order to handle odd output depths and heights */
+      int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+      /* Load the bias values corresponding to two output channels */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+      xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias */
+      xb_vec2Nx24 dacc1, dacc2;
+
+      dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+      dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+      IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+      /* priming of coeff load is done outside the innermost loop*/
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+      valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2;
+
+        /* load data from first 4 input rows */
+        pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+        valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData1, vaInData, pdvecIn, (3 + enable2ndRow) * inDataPitch1);
+
+        /* load data from next 4 input rows */
+        MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData2, vaInData, pdvecIn, (enable2ndRow + enable3rdRow) * inDataPitch1);
+        pInput += inDataPitch2;
+
+        /* load all the 3x3 coefficients for 2 output depths*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 9);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 9);
+
+        /* Rearrange them so that zero is inserted where the MUL4T should not have effect */
+        dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+        dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+
+        /* dvecInData1 contains first 4 input rows and
+         * dvecInData2 contains next 4 input rows.
+         * dvecInData1: row0 | row1 | row2 | row3
+         * dvecInData2: row4 | row5 | row6 | row7
+         *
+         * Input data is re arranged in such a manner that
+         * dvecTemp1 contains: row1 | row2 | row3 | row4
+         * dvecTemp2 contains: row2 | row3 | row4 | row5
+         */
+        xb_vec2Nx8 dvecTemp1, dvecTemp2;
+        dvecTemp1 = IVP_SEL2NX8(dvecInData2, dvecInData1, dvecSeq);
+        dvecTemp2 = IVP_SEL2NX8(IVP_SEL2NX8(0, dvecInData2, dvecSeq), dvecTemp1, dvecSeq);
+
+        /* Multiply input data with coefficients from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, dvecTemp1, dvecTemp1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MUL4TA(dacc1, dvecTemp2, dvecTemp2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+        /* Multiply input data with coefficients from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc2, dvecTemp1, dvecTemp1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MUL4TA(dacc2, dvecTemp2, dvecTemp2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+      }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+      /* Pack, Output Scale, Output Shift and clamping */
+      xb_vec2Nx8 dvecOut1L, dvecOut2L;
+      xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+      /* Storing the first output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, 3rd row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * enable3rdRow * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable3rdRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+      IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            outDataPitch1 * enable2ndRow) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable2ndRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, 3rd row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            2 * outDataPitch1 * enable3rdRow) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable3rdRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+
+      pOutput += 2 * outDataPitch2 * bytesPerPixel;
+      pCoeff  += 2 * coeffPitch3;
+    } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+  }   /* end of if(y < outH)*/
+}
+
+/******************************************************************************************
+* MOW fold 32 Stride 1                                                                    *
+* Called when inDataPitch1 is greater than 16 and                                         *
+* less than or equal to 32.                                                               *
+******************************************************************************************/
+
+/*
+ * 3x3 convolution (name suffix "3x3j1d1") for WHD-ordered tiles, FOLD32 variant.
+ *
+ * Two consecutive input rows are fetched with a single vector load of
+ * 2 * inDataPitch1 elements, so each accumulator carries two output rows:
+ * the first inDataPitch1 lanes belong to output row y and the next
+ * inDataPitch1 lanes to output row y + 1. The output-height loop and the
+ * output-channel loop are both unrolled by two, producing two rows of two
+ * output channels per innermost pass.
+ *
+ * Arguments (expanded through MAKE_ARGUMENTS):
+ *   inTile    - input tile; read through MORPH_IDT_SCALAR pointers.
+ *   coeffTile - 3x3 coefficients; 9 bytes are consumed per input channel and
+ *               the dim3 pitch is used to step between output channels.
+ *   biasArray - one 32-bit bias per output channel.
+ *   outTile   - output tile; element size and type select the clamp limits.
+ *   param     - convolution parameters (accumulator shift, output shift,
+ *               ReLU flag and limits, and the output scale when
+ *               DILATED_VQ_CONV == VQ_FALSE).
+ * When DILATED_VQ_CONV == VQ_TRUE the per-channel scale is read from
+ * outputScaleArray, which is presumably supplied by MAKE_ARGUMENTS
+ * (NOTE(review): confirm against the macro definition).
+ *
+ * This function performs no argument validation of its own and returns
+ * nothing.
+ */
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_3x3j1d1), S8IX_MOW_WHD_FOLD32) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitch of Coefficient Data (WHDN) in dim3; used to step from one output
+   * channel's coefficients to the next */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+
+  const uint8_t outShiftU  = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  /* Output scale: one value per output channel in the VQ build, a single
+   * value taken from param otherwise */
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge): step back by
+   * kSizeU / 2 rows and kSizeU / 2 columns */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType.
+   * Without ReLU the limits are the full range of the output type
+   * (S16, S8, or otherwise 0..UCHAR_MAX). */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  /* typeFlag is 1 for a 16-bit signed output tile, 0 otherwise */
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, y;
+
+  /* Generating the shuffle pattern for coefficient loads.
+     The idea is to populate zero value where the MUL4T should not affect
+     Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... */
+  xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \
+                                      IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3),
+                                                               IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32,
+                                                   IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))),
+                                    IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+
+  /* Select sequence to re-arrange input data.
+   * Lanes below inDataPitch1 get index (lane + inDataPitch1); the remaining
+   * lanes get index (lane - inDataPitch1 + 2 * XCHAL_IVPN_SIMD_WIDTH).
+   * Used below to build "row1 | row2" from the two loaded row pairs. */
+  xb_vec2Nx8 dvecSeq = 0;
+  IVP_ADD2NX8T(dvecSeq, IVP_SEQ2NX8(), inDataPitch1, IVP_LT2NX8(IVP_SEQ2NX8(), inDataPitch1));
+  IVP_ADD2NX8T(dvecSeq, IVP_SUB2NX8(IVP_SEQ2NX8(), inDataPitch1), (2 * XCHAL_IVPN_SIMD_WIDTH), \
+               IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), inDataPitch1)));
+
+
+  /* loop across output channels and output height are unrolled twice
+   * to produce four output vectors in 1 iteration
+   */
+  for (y = 0; y < outH; y += 2)   /* Loop across output height */
+  {
+    /* in order to handle odd output height:
+     * presumably 1 when y < outH - 1 (a second output row exists), else 0 */
+    int32_t enable2Row = XT_SALT(y, outH - 1);
+    /* initialize output data pointer */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize coeff and Bias data pointer */
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+    {
+      /* In order to handle odd output depths:
+       * presumably 1 when outCh < numOutCh - 1 (a second channel exists), else 0 */
+      int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+      /* Load the bias values corresponding to two output channels.
+       * With enable2ndCh == 0 the first load does not advance pBias, so the
+       * second load re-reads the same bias instead of reading past the array. */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+      xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias */
+      xb_vec2Nx24 dacc1, dacc2;
+
+      dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+      dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+      IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+      /* priming of coeff load is done outside the innermost loop*/
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      /* With enable2ndCh == 0 the second coefficient pointer aliases the first */
+      pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+      valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2;
+
+        /* load data from first 2 input rows */
+        pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+        valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, 2 * inDataPitch1);
+
+        /* load data from next 2 input rows; the requested length drops to a
+         * single row (inDataPitch1) when enable2Row is 0 */
+        MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData2, vaInData, pdvecIn, inDataPitch1 \
+                                    + inDataPitch1 * enable2Row);
+        /* advance to the same rows of the next input channel */
+        pInput += inDataPitch2;
+
+        /* load all the 3x3 coefficients for 2 output depths*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 9);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 9);
+
+        /* Rearrange them so that zero is inserted where the MUL4T should not have effect */
+        dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+        dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+
+        /* dvecInData1 contains first 2 input rows and
+         * dvecInData2 contains next 2 input rows.
+         * dvecInData1: row0 | row1
+         * dvecInData2: row2 | row3
+         *
+         * dvecInData1 is multiplied with 1st row of coefficient and
+         * dvecInData2 is multiplied with 3rd row of coefficient.
+         *
+         * To multiply input data with 2nd coefficient row, it is required
+         * to store row1 and row 2 in another vector
+         *
+         * dvecTemp: row1 | row2
+         *
+         * So first inDataPitch1 elements in the accumulator corresponds to
+         * first output row and next inDataPitch1 number of elements corresponds
+         * to 2nd output row.
+         */
+        xb_vec2Nx8 dvecTemp1;
+        dvecTemp1 = IVP_SEL2NX8(dvecInData2, dvecInData1, dvecSeq);
+
+        /* Multiply input data with coefficients from 1st output channel.
+         * The 32-bit words 0, 1 and 2 extracted from the shuffled coefficient
+         * vector hold kernel rows 1, 2 and 3 (three taps plus the inserted zero). */
+        MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, dvecTemp1, dvecTemp1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+        /* Multiply input data with coefficients from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc2, dvecTemp1, dvecTemp1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MUL4TA(dacc2, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+      }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+      /* Pack, Output Scale, Output Shift and clamping.
+       * In the VQ build the second channel's scale index is
+       * outCh + enable2ndCh, so it falls back to outCh when no second
+       * channel exists. */
+      xb_vec2Nx8 dvecOut1L, dvecOut2L;
+      xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+
+      /* Each store below writes bytesPerPixel * outW bytes multiplied by the
+       * relevant enable flags, so the byte count becomes 0 for a row or
+       * channel that does not exist. */
+
+      /* Storing the first output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, second row.
+       * The select starts at byte offset inDataPitch1 * bytesPerPixel of the
+       * packed result, where the second output row begins. */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2Row * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+
+      /* Storing the second output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+      IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            outDataPitch1 * enable2Row) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                     enable2Row * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* advance output and coefficient pointers by two output channels */
+      pOutput += 2 * outDataPitch2 * bytesPerPixel;
+      pCoeff  += 2 * coeffPitch3;
+    }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+  }    /* end of for (y = 0; y < outH; y += 2)*/
+}
+
+/****************** xaiConvolvedVQ3D_S_3x3j1d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_3x3j1d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_3x3j1d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_3x3j1d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_3x3j1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 3);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                           \
+                    XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONX(param));
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_TILE3D_EDGE(inTile, 1);
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* Check inDataPitch1: if it is less than or equal to 16, call the FOLD16
+   * variant; if it is greater than 16 but less than or equal to 32, call the
+   * FOLD32 variant; otherwise continue with the code below.
+   */
+
+  if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 16)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_3x3j1d1), S8IX_MOW_WHD_FOLD16) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 32)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_3x3j1d1), S8IX_MOW_WHD_FOLD32) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+
+  const uint8_t outShiftU  = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth = ((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1;
+
+  /* Generating the shuffle pattern for coefficent loads.
+     The idea is to populate zero value where the MUL4T should not affect
+     Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... */
+  xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \
+                                      IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3),
+                                                               IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32,
+                                                   IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))),
+                                    IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+  /* loop across output channels and output height are unrolled twice
+   * to produce four output vectors in 1 iteration
+   */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */
+  {
+    for (y = 0; y < outH; y += 2)   /* Loop across output height */
+    {
+      /* in order to handle odd output height */
+      int32_t enable2Row = XT_SALT(y, outH - 1);
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        /* Load the bias values corresponding to two output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+        dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+        dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        /* Input vector pointer initialization */
+        pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+
+        for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+          MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4;
+
+          /* load data from first input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, inDataPitch1 * enable2Row);
+
+          /* load data from 4th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData, pdvecIn, inDataPitch2 - (2 + enable2Row) * inDataPitch1);
+
+          /* load all the 3x3 coefficients for 2 output depths*/
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 9);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 9);
+
+          /* Rearrange them so that zero is inserted where the MUL4T should not have effect */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+
+          /* Multiply and accumulate 1st set of 3 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc12, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc21, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc22, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+          /* Multiply and accumulate 2nd set of 3 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc12, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc21, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MUL4TA(dacc22, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+          /* Multiply and accumulate 3rd set of 3 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc12, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc21, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MUL4TA(dacc22, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores */
+        int32_t varLen = XT_MIN(vectorizationWidth, outW - x);
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2Row * varLen);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * enable2Row * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * enable2ndCh * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2Row) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                       enable2Row * varLen);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * enable2ndCh * \
+                       enable2Row * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 2 * coeffPitch3;
+      }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+    }    /* end of for (y = 0; y < outH; y += 2)*/
+  }      /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolved(VQ)3D_S_3x3j2d1I8S8IX_MOW_WHD
+*  **************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized implementation for 3x3 3D convolution with      */
+/*               stride = 2. Based on MORPH pre-processor specifiers, code    */
+/*               implementation is generated during preprocessing stage. This */
+/*               method can be used to generate 3x3 3D dilated convolution    */
+/*               function and 3x3 3D VQ dilated convolution function for U8   */
+/*               bit and S8 bit input data.                                   */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 3x3xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************* convolvedVQ3D_S_3x3j2d1_S8S8IX_MOW_WHD_INCHANNEL3 *****************/
+/******************* convolvedVQ3D_S_3x3j2d1_U8S8IX_MOW_WHD_INCHANNEL3 *****************/
+/******************* convolved3D_S_3x3j2d1_S8S8IX_MOW_WHD_INCHANNEL3   *****************/
+/******************* convolved3D_S_3x3j2d1_U8S8IX_MOW_WHD_INCHANNEL3   *****************/
+/*            This function is called when the number of input channels is 3             */
+/*****************************************************************************************/
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_3x3j2d1), S8IX_MOW_WHD_INCHANNEL3) \
+  MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* 3x3, stride-2 kernel specialised for 3 input channels; first read the output tile extents (W, H, channels) */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Dim3 pitch of the coefficient data (WHDN); used below as the stride between output channels' coefficients */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel width (dim1 of the WHDN coefficient tile) */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Rewind by kSizeU / 2 rows and kSizeU / 2 elements so the pointer starts at the data including the edge */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Output clamp limits: ReLU bounds when enabled, otherwise the full range of the output type (S16, S8, else U8) */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 for S16 output, 0 otherwise */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8* restrict pdvecIn1;
+  MORPH_IDT_2Nx8* restrict pdvecIn2;
+  MORPH_IDT_2Nx8* restrict pdvecIn3;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vecN_2x32v* restrict phvecBias;
+  int32_t outCh, y, x;
+
+  xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3;
+  /* inCh = 1 : rearranged (deinterleaved / rotated) vectors that feed the multiplies */
+  MORPH_IDT_2Nx8 dvecDataCh101, dvecDataCh102, dvecDataCh103;
+  MORPH_IDT_2Nx8 dvecDataCh111, dvecDataCh112, dvecDataCh113;
+  MORPH_IDT_2Nx8 dvecDataCh121, dvecDataCh122, dvecDataCh123;
+  /* inCh = 2 : rearranged (deinterleaved / rotated) vectors that feed the multiplies */
+  MORPH_IDT_2Nx8 dvecDataCh201, dvecDataCh202, dvecDataCh203;
+  MORPH_IDT_2Nx8 dvecDataCh211, dvecDataCh212, dvecDataCh213;
+  MORPH_IDT_2Nx8 dvecDataCh221, dvecDataCh222, dvecDataCh223;
+  /* inCh = 3 : rearranged (deinterleaved / rotated) vectors that feed the multiplies */
+  MORPH_IDT_2Nx8 dvecDataCh301, dvecDataCh302, dvecDataCh303;
+  MORPH_IDT_2Nx8 dvecDataCh311, dvecDataCh312, dvecDataCh313;
+  MORPH_IDT_2Nx8 dvecDataCh321, dvecDataCh322, dvecDataCh323;
+
+  /* raw input rows 1..5 as loaded for inCh = 1 */
+  xb_vec2Nx8 dvecInDataCh1_1, dvecInDataCh1_2, dvecInDataCh1_3, dvecInDataCh1_4, dvecInDataCh1_5;
+  /* raw input rows 1..5 as loaded for inCh = 2 */
+  xb_vec2Nx8 dvecInDataCh2_1, dvecInDataCh2_2, dvecInDataCh2_3, dvecInDataCh2_4, dvecInDataCh2_5;
+  /* raw input rows 1..5 as loaded for inCh = 3 */
+  xb_vec2Nx8 dvecInDataCh3_1, dvecInDataCh3_2, dvecInDataCh3_3, dvecInDataCh3_4, dvecInDataCh3_5;
+
+  const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1; /* output columns handled per x iteration */
+
+  /* Generating the select pattern for coefficient loads: lane index + coeffPitch3.
+   * Pattern : 27, 28, 29, 30, .... (i.e. when coeffPitch3 is 27)
+   */
+  xb_vec2Nx8 dvecSeq = IVP_ADD2NX8(IVP_SEQ2NX8(), coeffPitch3);
+
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */
+  {
+    /* variable length for output stores */
+    int32_t varLen = XT_MIN(vectorizationWidth, outW - x);
+    int32_t remX   = bytesPerPixel * varLen; /* byte count of each first-row store */
+
+    for (y = 0; y < outH; y += 2) /* Two output rows per iteration */
+    {
+      /* XT_SALT(y, outH - 1) flag: expected 1 while a second output row (y + 1) exists - handles odd output height */
+      int32_t enable2outH = XT_SALT(y, outH - 1);
+      int32_t remLoad     = inDataPitch1 * enable2outH; /* pointer step after rows 3..5; 0 when there is no 2nd output row */
+
+      /* lengths used by the second-row stores; all are zero when there is no second output row */
+      int32_t outVarLen    = varLen * enable2outH;
+      int32_t outVarFlag   = outVarLen * typeFlag;
+      int32_t outVarFlagx2 = outVarFlag * 2;
+      enable2outH = outDataPitch1 * enable2outH * bytesPerPixel; /* reused: now the byte offset to the second output row */
+
+      /* Initialize input and output data pointers */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* input pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y * stride + x * stride];
+
+      /* Load input data */
+      /* InCh = 1 */
+      pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+      /* load data from 1st input row */
+      valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+      IVP_LA2NX8_XP(dvecInDataCh1_1, vaInData, pdvecIn1, inDataPitch1);
+
+      /* load data from 2nd input row */
+      vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+      IVP_LA2NX8_XP(dvecInDataCh1_2, vaInData, pdvecIn1, inDataPitch1);
+
+      /* load data from 3rd input row */
+      vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+      IVP_LA2NX8_XP(dvecInDataCh1_3, vaInData, pdvecIn1, remLoad);
+
+      /* load data from 4th input row */
+      vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+      IVP_LA2NX8_XP(dvecInDataCh1_4, vaInData, pdvecIn1, remLoad);
+
+      /* load data from 5th input row */
+      vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+      IVP_LA2NX8_XP(dvecInDataCh1_5, vaInData, pdvecIn1, remLoad);
+
+      /* 32 elements from 1st row and 32 elements from 3rd row are concatenated here.
+       * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...63, and the 3rd input row is
+       * 64,65,66,67.........126,127, Data should be arranged as
+       *
+       * dvecDataCh101 : 0, 2, 4,...58,60,62,64,66,68,...122,124,126
+       * dvecDataCh102 : 1, 3, 5,...59,61,63,65,67,69,...123,125,127
+       * dvecDataCh103 : 2, 4, 6,...60,62,0 ,66,68,70,...124,126,0
+       *
+       * (lane values carried over from the original note - TODO confirm the
+       * wrap-around lanes against the IVP_SEL2NX8I rotate semantics)
+       * Lower half of the vectors contain data from 1st input row and
+       * upper half of the vectors contain data from 3rd input row.
+       */
+      /* Form the vectors for the 2 output height rows - row 1 and row3 */
+      IVP_DSEL2NX8I(dvecDataCh102, dvecDataCh101, dvecInDataCh1_3, dvecInDataCh1_1, IVP_DSELI_8B_DEINTERLEAVE_1);
+      dvecDataCh103 = IVP_SEL2NX8I(dvecDataCh101, dvecDataCh101, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+      /* Form the vectors for the 2 output height rows - row 2 and row4 */
+      IVP_DSEL2NX8I(dvecDataCh112, dvecDataCh111, dvecInDataCh1_4, dvecInDataCh1_2, IVP_DSELI_8B_DEINTERLEAVE_1);
+      dvecDataCh113 = IVP_SEL2NX8I(dvecDataCh111, dvecDataCh111, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+      /* Form the vectors for the 2 output height rows - row 3 and row5 */
+      IVP_DSEL2NX8I(dvecDataCh122, dvecDataCh121, dvecInDataCh1_5, dvecInDataCh1_3, IVP_DSELI_8B_DEINTERLEAVE_1);
+      dvecDataCh123 = IVP_SEL2NX8I(dvecDataCh121, dvecDataCh121, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+      /* InCh = 2 */
+      pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2);
+      /* load data from 1st input row */
+      valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+      IVP_LA2NX8_XP(dvecInDataCh2_1, vaInData2, pdvecIn2, inDataPitch1);
+
+      /* load data from 2nd input row */
+      vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+      IVP_LA2NX8_XP(dvecInDataCh2_2, vaInData2, pdvecIn2, inDataPitch1);
+
+      /* load data from 3rd input row */
+      vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+      IVP_LA2NX8_XP(dvecInDataCh2_3, vaInData2, pdvecIn2, remLoad);
+
+      /* load data from 4th input row */
+      vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+      IVP_LA2NX8_XP(dvecInDataCh2_4, vaInData2, pdvecIn2, remLoad);
+
+      /* load data from 5th input row */
+      vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+      IVP_LA2NX8_XP(dvecInDataCh2_5, vaInData2, pdvecIn2, remLoad);
+
+      /* Form the vectors for the 2 output height rows - row 1 and row3 */
+      IVP_DSEL2NX8I(dvecDataCh202, dvecDataCh201, dvecInDataCh2_3, dvecInDataCh2_1, IVP_DSELI_8B_DEINTERLEAVE_1);
+      dvecDataCh203 = IVP_SEL2NX8I(dvecDataCh201, dvecDataCh201, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+      /* Form the vectors for the 2 output height rows - row 2 and row4 */
+      IVP_DSEL2NX8I(dvecDataCh212, dvecDataCh211, dvecInDataCh2_4, dvecInDataCh2_2, IVP_DSELI_8B_DEINTERLEAVE_1);
+      dvecDataCh213 = IVP_SEL2NX8I(dvecDataCh211, dvecDataCh211, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+      /* Form the vectors for the 2 output height rows - row 3 and row5 */
+      IVP_DSEL2NX8I(dvecDataCh222, dvecDataCh221, dvecInDataCh2_5, dvecInDataCh2_3, IVP_DSELI_8B_DEINTERLEAVE_1);
+      dvecDataCh223 = IVP_SEL2NX8I(dvecDataCh221, dvecDataCh221, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+      /* InCh = 3 */
+      pdvecIn3 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+      /* load data from 1st input row */
+      valign vaInData3 = MORPH_OP_PRIME_2Nx8(pdvecIn3);
+      IVP_LA2NX8_XP(dvecInDataCh3_1, vaInData3, pdvecIn3, inDataPitch1);
+
+      /* load data from 2nd input row */
+      vaInData3 = MORPH_OP_PRIME_2Nx8(pdvecIn3);
+      IVP_LA2NX8_XP(dvecInDataCh3_2, vaInData3, pdvecIn3, inDataPitch1);
+
+      /* load data from 3rd input row */
+      vaInData3 = MORPH_OP_PRIME_2Nx8(pdvecIn3);
+      IVP_LA2NX8_XP(dvecInDataCh3_3, vaInData3, pdvecIn3, remLoad);
+
+      /* load data from 4th input row */
+      vaInData3 = MORPH_OP_PRIME_2Nx8(pdvecIn3);
+      IVP_LA2NX8_XP(dvecInDataCh3_4, vaInData3, pdvecIn3, remLoad);
+
+      /* load data from 5th input row */
+      vaInData3 = MORPH_OP_PRIME_2Nx8(pdvecIn3);
+      IVP_LA2NX8_XP(dvecInDataCh3_5, vaInData3, pdvecIn3, remLoad);
+
+      /* Form the vectors for the 2 output height rows - row 1 and row3 */
+      IVP_DSEL2NX8I(dvecDataCh302, dvecDataCh301, dvecInDataCh3_3, dvecInDataCh3_1, IVP_DSELI_8B_DEINTERLEAVE_1);
+      dvecDataCh303 = IVP_SEL2NX8I(dvecDataCh301, dvecDataCh301, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+      /* Form the vectors for the 2 output height rows - row 2 and row4 */
+      IVP_DSEL2NX8I(dvecDataCh312, dvecDataCh311, dvecInDataCh3_4, dvecInDataCh3_2, IVP_DSELI_8B_DEINTERLEAVE_1);
+      dvecDataCh313 = IVP_SEL2NX8I(dvecDataCh311, dvecDataCh311, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+      /* Form the vectors for the 2 output height rows - row 3 and row5 */
+      IVP_DSEL2NX8I(dvecDataCh322, dvecDataCh321, dvecInDataCh3_5, dvecInDataCh3_3, IVP_DSELI_8B_DEINTERLEAVE_1);
+      dvecDataCh323 = IVP_SEL2NX8I(dvecDataCh321, dvecDataCh321, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+      /* priming of coeff load is done outside the innermost loop */
+      pdvecCoeff = (xb_vec2Nx8 *) (pCoeffData);
+      valign vaCoeffData = IVP_LA2NX8_PP(pdvecCoeff);
+
+      /* priming of bias load is done outside the innermost loop */
+      phvecBias = (xb_vecN_2x32v *) (pBiasData);
+      valign vaBias = IVP_LAN_2X32_PP(phvecBias);
+
+      for (outCh = 0; outCh < numOutCh; outCh += 3) /* Three output channels per iteration */
+      {
+        /* Flags to handle the output depth tail: expected 1 when a 2nd / 3rd channel exists in this group, else 0 */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t out2Ch      = outDataPitch2 * enable2ndCh * bytesPerPixel; /* byte offset to the 2nd channel (0 if absent) */
+        int32_t out3Ch      = outDataPitch2 * enable3rdCh * bytesPerPixel * 2; /* byte offset to the 3rd channel (0 if absent) */
+
+        /* Load the bias values corresponding to three output channels */
+        xb_vecN_2x32v hvecBias; IVP_LAVN_2X32_XP(hvecBias, vaBias, phvecBias, 3 * 4); /* 3 biases x 4 bytes */
+        xb_vecN_2x32v hvecBias1 = IVP_REPN_2X32(hvecBias, 0);
+        xb_vecN_2x32v hvecBias2 = IVP_REPN_2X32(hvecBias, 1);
+        xb_vecN_2x32v hvecBias3 = IVP_REPN_2X32(hvecBias, 2);
+
+        /* load all the 3x3 coefficients for outChannel - 1 and outChannel - 2 */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData, pdvecCoeff, 2 * coeffPitch3);
+        /* select 3x3 coefficients for outChannel - 2 (lanes starting at coeffPitch3) */
+        dvecCoeffData2 = IVP_SEL2NX8(dvecCoeffData1, dvecCoeffData1, dvecSeq);
+        /* load all the 3x3 coefficients for outChannel - 3 (zero-length load when it is absent) */
+        IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData, pdvecCoeff, coeffPitch3 * enable3rdCh);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+        xb_vec2Nx24 dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+        xb_vec2Nx24 dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+
+        /* Values corresponding to first and second output row are packed in one register
+           so that same coefficient will get multiplied to them */
+        /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecDataCh111, dvecDataCh103, dvecDataCh102, dvecDataCh101,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+        MORPH_OP_MULQA(dacc2, dvecDataCh111, dvecDataCh103, dvecDataCh102, dvecDataCh101,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+        MORPH_OP_MULQA(dacc3, dvecDataCh111, dvecDataCh103, dvecDataCh102, dvecDataCh101,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+
+        /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecDataCh122, dvecDataCh121, dvecDataCh113, dvecDataCh112,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MULQA(dacc2, dvecDataCh122, dvecDataCh121, dvecDataCh113, dvecDataCh112,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+        MORPH_OP_MULQA(dacc3, dvecDataCh122, dvecDataCh121, dvecDataCh113, dvecDataCh112,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+
+        /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecDataCh203, dvecDataCh202, dvecDataCh201, dvecDataCh123,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+        MORPH_OP_MULQA(dacc2, dvecDataCh203, dvecDataCh202, dvecDataCh201, dvecDataCh123,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+        MORPH_OP_MULQA(dacc3, dvecDataCh203, dvecDataCh202, dvecDataCh201, dvecDataCh123,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+
+        /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecDataCh221, dvecDataCh213, dvecDataCh212, dvecDataCh211,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+        MORPH_OP_MULQA(dacc2, dvecDataCh221, dvecDataCh213, dvecDataCh212, dvecDataCh211,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+
+        MORPH_OP_MULQA(dacc3, dvecDataCh221, dvecDataCh213, dvecDataCh212, dvecDataCh211,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+
+        /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecDataCh302, dvecDataCh301, dvecDataCh223, dvecDataCh222,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+
+        MORPH_OP_MULQA(dacc2, dvecDataCh302, dvecDataCh301, dvecDataCh223, dvecDataCh222,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+
+        MORPH_OP_MULQA(dacc3, dvecDataCh302, dvecDataCh301, dvecDataCh223, dvecDataCh222,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4));
+
+        /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecDataCh313, dvecDataCh312, dvecDataCh311, dvecDataCh303,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+        MORPH_OP_MULQA(dacc2, dvecDataCh313, dvecDataCh312, dvecDataCh311, dvecDataCh303,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+        MORPH_OP_MULQA(dacc3, dvecDataCh313, dvecDataCh312, dvecDataCh311, dvecDataCh303,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5));
+
+        /* Multiply and accumulate the 7th set: only 3 data vectors remain, the 4th data operand is 0 */
+        MORPH_OP_MULQA(dacc1, 0, dvecDataCh323, dvecDataCh322, dvecDataCh321,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6));
+
+        MORPH_OP_MULQA(dacc2, 0, dvecDataCh323, dvecDataCh322, dvecDataCh321,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6));
+
+        MORPH_OP_MULQA(dacc3, 0, dvecDataCh323, dvecDataCh322, dvecDataCh321,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 6));
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut1H;
+        xb_vec2Nx8 dvecOut2L, dvecOut2H;
+        xb_vec2Nx8 dvecOut3L, dvecOut3H;
+
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, remX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row (zero-length store when the channel is absent) */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + out2Ch);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, remX * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the third output depth, first row (zero-length store when the channel is absent) */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + out3Ch);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, remX * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first output depth, second row: upper half of dvecOut1L for non-S16 output, dvecOut1H for S16 */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2outH);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut1L, dvecOut1L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, (outVarLen - outVarFlag));
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, outVarFlagx2);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + out2Ch + enable2outH);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut2L, dvecOut2L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, enable2ndCh * (outVarLen - outVarFlag));
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, enable2ndCh * outVarFlagx2);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the third output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + out3Ch + enable2outH);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut3L, dvecOut3L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, enable3rdCh * (outVarLen - outVarFlag));
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, enable3rdCh * outVarFlagx2);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 3 * outDataPitch2 * bytesPerPixel;
+      }
+    }
+  }
+}
+/****************** xaiConvolvedVQ3D_S_3x3j2d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_3x3j2d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_3x3j2d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_3x3j2d1_U8S8IX_MOW_WHD *******************/
+/* 3x3 kernel, stride 2, dilation 1 convolution on WHD tiles with WHDN coefficients (enforced by the checks below). */
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_3x3j2d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 3);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                           \
+                    XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE3D_EDGE(inTile, 1);
+    XAI_CHECK_STRIDE(param, 2);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  if (XAI_TILE3D_GET_DIM3(inTile) == 3)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_3x3j2d1), S8IX_MOW_WHD_INCHANNEL3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8* restrict pdvecIn1;
+
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+  xb_vec2Nx8* restrict pdvecCoeff3;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1;
+
+  /* Generating the shuffle pattern for coefficient loads.
+     The idea is to populate zero value where the MULQA should not have an effect
+     Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... */
+  xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \
+                                      IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3),
+                                                               IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32,
+                                                   IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))),
+                                    IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+  /* loop across output depth is unrolled by 3
+   * , producing lanes from 3 output channels
+   * in one iteration. Since vectorization width
+   * is just half the width of the accumulator,
+   * loop across output height is also unrolled by 2.
+   * Unrolling across output height makes it possible
+   * to utilize all the 64 MACs in the accumulator.
+   *
+   * Data loaded from the 2 input rows is concatenated
+   * in such a manner that lower half of the output
+   * vector gives the first output row and the upper
+   * half of the output vector gives the next output row.
+   */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */
+  {
+    for (y = 0; y < outH - 1; y += 2) /* Loop across output height */
+    {
+      /* Initialize i/p and o/p data pointers */
+      int8_t *pOutput          = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y * stride + x * stride];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 3)  /* Loop across Output depth */
+      {
+        /* In order to handle output depths that are not a multiple of 3 */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+
+        /* Load the bias values corresponding to three output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        /* Coeff for 1st output channel */
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+        /* Coeff for 2nd output channel */
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+        /* Coeff for 3rd output channel */
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+
+        /* Input vector pointer initialization */
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+
+
+        for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3;
+          xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5;
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+
+          /* load data from 1st input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData1, vaInData, pdvecIn1, inDataPitch1);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData2, vaInData, pdvecIn1, inDataPitch1);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData3, vaInData, pdvecIn1, inDataPitch1);
+
+          /* load data from 4th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData4, vaInData, pdvecIn1, inDataPitch1);
+
+          /* load data from 5th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData5, vaInData, pdvecIn1, inDataPitch2 - (4 * inDataPitch1));
+
+          /* load all the 3x3 coefficients for 3 output depths*/
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2);
+
+          /* Rearrange them so that zero is inserted where the MULQ should not have effect */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+          dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+
+          /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+           * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...63, and the 2nd input row is
+           * 64,65,66,67.........126,127, Data should be arranged  as
+           *
+           * dvecData1 : 0, 2, 4,...58,60,62,64,66,68,...122,124,126
+           * dvecData2 : 1, 3, 5,...59,61,63,65,67,69,...123,125,127
+           * dvecData3 : 2, 4, 6,...60,62,0 ,66,68,70,...124,126,0
+           *
+           *
+           * Lower half of the vectors contain data from 1st input row and
+           * upper half of the vectors contain data from 2nd input row.
+           *
+           */
+
+          /* Form 2 vectors from the 2 output height rows - row 1 and row3 */
+          IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          /* Values corresponding to first and second row are packed in one register
+             so that same coefficient will get multiplied to them */
+          /* Multiply and accumulate 1st set of 3 coefficients for all the outputs */
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+
+
+          /* Form 2 vectors from the 2 output height rows - row 2 and row4 */
+          IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+          /* Multiply and accumulate 2nd set of 3 coefficients for all the outputs */
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+
+
+          /* Form 2 vectors from the 2 output height rows - row 3 and row5 */
+          IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+          /* Multiply and accumulate 3rd set of 3 coefficients for all the outputs */
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+        }  /* end of for (inCh = 0; inCh < numInCh; inCh++) */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores */
+        int32_t varLen = XT_MIN(vectorizationWidth, outW - x);
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the third output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+
+        /* Storing the first output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut1L, dvecOut1L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut2L, dvecOut2L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, enable2ndCh * (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, enable2ndCh * typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the third output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut3L, dvecOut3L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, enable3rdCh * (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, enable3rdCh * typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 3 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 3 * coeffPitch3;
+      }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 3) */
+    }   /* end of for (y = 0; y < outH - 1; y += 2) */
+    if (y < outH)
+    {
+      /* Leftover last output row (outH odd): initialize i/p and o/p data pointers */
+      int8_t *pOutput          = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y * stride + x * stride];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 3)  /* Loop across Output depth */
+      {
+        /* In order to handle output depths that are not a multiple of 3 */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+
+        /* Load the bias values corresponding to three output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        /* Coeff for 1st output channel */
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+        /* Coeff for 2nd output channel */
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+        /* Coeff for 3rd output channel */
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+
+        /* Input vector pointer initialization */
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+
+
+        for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3;
+          xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3;
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+
+          /* load data from 1st input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData1, vaInData, pdvecIn1, inDataPitch1);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData2, vaInData, pdvecIn1, inDataPitch1);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData3, vaInData, pdvecIn1, inDataPitch2 - 2 * inDataPitch1);
+
+          /* load all the 3x3 coefficients for 3 output depths*/
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2);
+
+          /* Rearrange them so that zero is inserted where the MULQ should not have effect */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+          dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+
+          /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+           * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...63, and the 2nd input row is
+           * 64,65,66,67.........126,127, Data should be arranged  as
+           *
+           * dvecData1 : 0, 2, 4,...58,60,62,64,66,68,...122,124,126
+           * dvecData2 : 1, 3, 5,...59,61,63,65,67,69,...123,125,127
+           * dvecData3 : 2, 4, 6,...60,62,0 ,66,68,70,...124,126,0
+           *
+           *
+           */
+
+          /* Form 2 vectors from the 2 output height rows - row 1 and row3 */
+          IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          /* Only the first-row store is issued in this path (see stores below),
+             so the upper-half lanes of the accumulators are never written out */
+          /* Multiply and accumulate 1st set of 3 coefficients for all the outputs */
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+
+
+          /* Form 2 vectors from input row 2 only; a zero vector is passed in place of row 4 */
+          IVP_DSEL2NX8I(dvecData2, dvecData1, 0, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+          /* Multiply and accumulate 2nd set of 3 coefficients for all the outputs */
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+
+
+          /* Form 2 vectors from input row 3 only; a zero vector is passed in place of row 5 */
+          IVP_DSEL2NX8I(dvecData2, dvecData1, 0, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+          /* Multiply and accumulate 3rd set of 3 coefficients for all the outputs */
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+        }  /* end of for (inCh = 0; inCh < numInCh; inCh++) */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores */
+        int32_t varLen = XT_MIN(vectorizationWidth, outW - x);
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the third output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 3 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 3 * coeffPitch3;
+      }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 3) */
+    }   /* end of if(y < outH) */
+  }    /* end of for (x = 0; x < outW; x += vectorizationWidth) */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_3x3j4d1I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 3x3 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 3x3 3D dilated convolution function and 3x3 */
+/*               3D VQ dilated convolution function for U8 bit and  S8 bit    */
+/*               input data with input stride equal to 4                      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 3x3xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_3x3j4d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_3x3j4d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_3x3j4d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_3x3j4d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_3x3j4d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error checks: this variant requires a 3x3 kernel, stride 4, dilation 1, WHD input/output and WHDN coefficients */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 3);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                           \
+                    XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE3D_EDGE(inTile, 1);
+    XAI_CHECK_STRIDE(param, 4);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)   /* zero output scale: skip the convolution and fill outTile with 0 (clamped to the ReLU limits when ReLU is enabled) */
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* Getting parameters from the tile structures; inW counts the dim1 edges on both sides as well */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel size, taken from dim1 of the WHDN coefficient tile */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+
+  const uint8_t outShiftU  = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride     = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Move pointer to the start of the data (including edge): back up kSizeU / 2 rows and kSizeU / 2 columns */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  MORPH_IDT_2Nx8 *restrict pdvecInp1;
+  MORPH_IDT_2Nx8 *restrict pdvecInp2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* accumulators for 2 output channels (dacc1: channel outCh, dacc2: channel outCh + enable2ndCh) */
+  xb_vec2Nx24 dacc1, dacc2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH;
+  int32_t varLen;
+
+  /* Output clamp limits: the ReLU min/max when ReLU is enabled, otherwise the range of the output tile type (S16, S8, else 0..UCHAR_MAX) */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;   /* 1 for S16 output, else 0 */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);                /* element size of the output tile */
+
+  /* Generating the shuffle pattern for coefficient loads.
+     The idea is to populate zero value where the multiply (MORPH_OP_MULQA) should not have an effect
+     Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... */
+  xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \
+                                      IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3),
+                                                               IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32,
+                                                   IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))),
+                                    IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+  /* loop across output depth is unrolled by 2
+   * , producing lanes from 2 output channels
+   * in one iteration. Since vectorization width
+   * is just half the width of the accumulator,
+   * loop across output height is also unrolled by 2.
+   * Unrolling across output height makes it possible
+   * to utilize all the 64 MACs in the accumulator.
+   *
+   * Data loaded from the 2 input rows is concatenated
+   * in such a manner that lower half of the output
+   * vector gives the first output row and the upper
+   * half of the output vector gives the next output row.
+   */
+
+  /* Loop nest: output width (outermost) -> output height -> output channels -> input channels (innermost) */
+  for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across output width */
+  {
+    /* out of bound flag: 1 while more than 2 * XCHAL_IVPN_SIMD_WIDTH input columns remain from column stride * x, else 0; it scales the 4th argument of the 1st vector load of every row - NOTE(review): presumably that argument of MORPH_OP_LOAD_2Nx8 is the byte count / pointer advance, confirm against the macro */
+    int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+    for (y = 0; y < outH; y += 2)   /* Loop across output Height */
+    {
+      /* To handle an odd output height: 1 while a 2nd output row (y + 1) exists, else 0 */
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+      /* Initialize i/p and o/p data pointers */
+      int8_t *pOutput          = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y * stride + x * stride];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across output channels */
+      {
+        /* To handle cases where numOutCh is a non-multiple of 2: 1 while channel outCh + 1 exists, else 0 */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        /* Input data vectors to generate 2 rows of output */
+        MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3;
+        /* Input data vectors for 1st row of output */
+        xb_vec2Nx8 dvecIn11, dvecIn12;
+        /* Input data vectors for 2nd row of output */
+        xb_vec2Nx8 dvecIn21, dvecIn22;
+        /* vectors for coefficients */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        valign vaData, vaOutData;
+
+        /* load and replicate bias data for each output channel; pBias moves by 4 * enable2ndCh bytes after the 1st load, so with no 2nd channel both loads read the same bias */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* Initialize all the accumulators with bias values */
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+        /* priming of 1st channel coeffs load */
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+        /* priming of 2nd channel coeffs load (same address as the 1st channel when enable2ndCh is 0) */
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        /* Starting location initialized for the input data; pdvecInp2 starts `stride` input rows below pdvecInp1 when a 2nd output row exists */
+        pdvecInp1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecInp2 = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2ndRow);
+
+        for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+        {
+          /* Loading of coefficients for 2 output channels (coeffPitch2 passed as the load length for each) */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+
+          /* Rearrange them so that zero is inserted where the MULQ should not have effect */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+
+          /* Loading first output row input data */
+          vaData = MORPH_OP_PRIME_2Nx8(pdvecInp1);
+          MORPH_OP_LOAD_2Nx8(dvecIn11, vaData, pdvecInp1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecIn12, vaData, pdvecInp1, inDataPitch1 - (2 * XCHAL_IVPN_SIMD_WIDTH * flag));
+
+          /* Loading second output row input data */
+          vaData = MORPH_OP_PRIME_2Nx8(pdvecInp2);
+          MORPH_OP_LOAD_2Nx8(dvecIn21, vaData, pdvecInp2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecIn22, vaData, pdvecInp2, inDataPitch1 - (2 * XCHAL_IVPN_SIMD_WIDTH * flag));
+
+          /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+           * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is
+           * 128,129,130,131.........252,253,254,255, Data should be arranged  as
+           *
+           * dvecData1 : 0, 4, 8,...124,128,132,136,...252
+           * dvecData2 : 1, 5, 9,...125,129,133,137,...253
+           * dvecData3 : 2, 6,10,...126,130,134,138,...254
+           *
+           * Lower half of each vector holds the input for the 1st output row and the
+           * upper half of each vector holds the input for the 2nd output row.
+           *
+           */
+          IVP_DSEL2NX8I(dvecIn12, dvecIn11, dvecIn12, dvecIn11, IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecIn22, dvecIn21, dvecIn22, dvecIn21, IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData3, dvecData1, \
+                        dvecIn21, dvecIn11, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData2 = IVP_SEL2NX8I(dvecIn22, dvecIn12, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+          /* Multiply and accumulate 1st set of 3 coefficients (32-bit coefficient word 0) for all the outputs */
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+          /* Load second kernel row of input for 1st output row */
+          vaData = MORPH_OP_PRIME_2Nx8(pdvecInp1);
+          MORPH_OP_LOAD_2Nx8(dvecIn11, vaData, pdvecInp1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecIn12, vaData, pdvecInp1, inDataPitch1 - (2 * XCHAL_IVPN_SIMD_WIDTH * flag));
+
+          /* Load second kernel row of input for 2nd output row */
+          vaData = MORPH_OP_PRIME_2Nx8(pdvecInp2);
+          MORPH_OP_LOAD_2Nx8(dvecIn21, vaData, pdvecInp2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecIn22, vaData, pdvecInp2, inDataPitch1 - (2 * XCHAL_IVPN_SIMD_WIDTH * flag));
+
+          IVP_DSEL2NX8I(dvecIn12, dvecIn11, dvecIn12, dvecIn11, IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecIn22, dvecIn21, dvecIn22, dvecIn21, IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData3, dvecData1, \
+                        dvecIn21, dvecIn11, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData2 = IVP_SEL2NX8I(dvecIn22, dvecIn12, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+          /* Multiply and accumulate 2nd set of 3 coefficients (32-bit coefficient word 1) for all the outputs */
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+          /* Load third kernel row of input for 1st output row; the 4th arguments over the three rows sum to inDataPitch2 (one input channel) */
+          vaData = MORPH_OP_PRIME_2Nx8(pdvecInp1);
+          MORPH_OP_LOAD_2Nx8(dvecIn11, vaData, pdvecInp1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecIn12, vaData, pdvecInp1, inDataPitch2 - 2 * inDataPitch1 - \
+                             (2 * XCHAL_IVPN_SIMD_WIDTH * flag));
+
+          /* Load third kernel row of input for 2nd output row; same offsets as for the 1st output row */
+          vaData = MORPH_OP_PRIME_2Nx8(pdvecInp2);
+          MORPH_OP_LOAD_2Nx8(dvecIn21, vaData, pdvecInp2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecIn22, vaData, pdvecInp2, inDataPitch2 - 2 * inDataPitch1 - \
+                             (2 * XCHAL_IVPN_SIMD_WIDTH * flag));
+
+          IVP_DSEL2NX8I(dvecIn12, dvecIn11, dvecIn12, dvecIn11, IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecIn22, dvecIn21, dvecIn22, dvecIn21, IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData3, dvecData1, \
+                        dvecIn21, dvecIn11, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData2 = IVP_SEL2NX8I(dvecIn22, dvecIn12, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+          /* Multiply and accumulate 3rd set of 3 coefficients (32-bit coefficient word 2) for all the outputs */
+          MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        }   /* END for (inCh = 0; inCh < numInCh; inCh++) */
+
+        /* Pack, Output Scale, Output Shift and clamping; VQ builds use a per-output-channel scale, non-VQ builds the single outScale */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable store count: number of output columns left in this vector, at most vectorizationWidth */
+        varLen = XT_MIN(outW - x, vectorizationWidth);
+
+        /* Store the first row, first output depth */
+        pdvecOut  = (xb_vec2Nx8 *) (pOutput);
+        vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the first row, 2nd output depth (store length is 0 when enable2ndCh is 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Get the 2nd output row elements which are in the upper half of output vectors */
+        dvecOut1L = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecOut1L, IVP_SELI_8B_EXTRACT_HI_HALVES);
+        dvecOut2L = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecOut2L, IVP_SELI_8B_EXTRACT_HI_HALVES);
+
+        /* Store the 2nd row, 1st output depth: with typeFlag == 0 only the dvecOut1L store has a non-zero length, with typeFlag == 1 (S16) only the dvecOut1H store has */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, (-typeFlag + 1) * varLen * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the 2nd row outputs from 2nd channel; store lengths are 0 unless both enable2ndCh and enable2ndRow are set */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, (-typeFlag + 1) * \
+                       varLen * enable2ndCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable2ndCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 2 * coeffPitch3;
+      } /* END for (outCh = 0; outCh < numOutCh; outCh += 2) */
+    }   /* END for (y = 0; y < outH; y += 2) */
+  }     /* END for (x = 0; x < outW; x += vectorizationWidth ) */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolved(VQ)3D_S_3x3j1d2I8S8IX_MOW_WHD
+*  **************************************************************************/
+
+/********************************************************************************/
+/* Description : P6 optimized generic implementation for 3x3 3D convolution with*/
+/*               dilation = 2. Based on MORPH pre-processor specifiers, code    */
+/*               implementation is generated during preprocessing stage. This   */
+/*               method can be used to generate 3x3 3D dilated convolution      */
+/*               function and 3x3 3D VQ dilated convolution function for U8 and */
+/*               S8 input data with stride equal to 1                           */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                  */
+/*               Output scale array, CNN convolution params structure           */
+/* Outputs     : XI Error Code                                                  */
+/* InOuts      : Output Tile                                                    */
+/* Assumptions : CoeffData is S8                                                */
+/*               biasArray is signed 32b, value not exceeding signed 24b        */
+/*               Output scale array is U16                                      */
+/*               OutData is S8 / U8 / S16                                       */
+/*               Kernel Size is 3x3xDxN                                         */
+/*               Input and Output are in WHD format                             */
+/*               Coeff is in WHDN format                                        */
+/********************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_3x3j1d2_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_3x3j1d2_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_3x3j1d2_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_3x3j1d2_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_3x3j1d2), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 3);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_TILE3D_EDGE(inTile, 2);
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_DILATION(param, 2);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t dilationU     = XAI_CNN_CONV_GET_DILATION(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Since the dilation value > 1 ,                                      */
+  /* Effective Kernel size = dilation(KernelSize - 1) + 1                */
+  /* Effective kernel size is used for calculating the min required edge */
+  int32_t dilatedKSizeU = dilationU * (kSizeU - 1) + 1;
+
+  /* Move pointer to the start of the data (including edge)              */
+  pInData = &pInData[-((dilatedKSizeU / 2) * inDataPitch1 + (dilatedKSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+
+
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth = 4 * XCHAL_IVPN_SIMD_WIDTH - dilatedKSizeU + 1;
+
+  /* Generating the shuffle pattern for coefficient loads.
+     The idea is to populate zero value where the MUL4T should not affect
+     Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... */
+  xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \
+                                      IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3),
+                                                               IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32,
+                                                   IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))),
+                                    IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+  /* Generating two select interleave pattern to apply on accumulator values just before storing
+   * For 8 bit output
+   *     Pattern1 = 0  64 1  65 2  66    ....  31 95
+   *     Pattern2 = 32 96 33 97 34 98  ...     63 127
+   * For 16 bit output
+   *     Pattern1 = 0  1  64 65  2 3  66 67 .... 30 31 94  95
+   *     Pattern2 = 32 33 96 97 34 35 98 99  ... 62 63 126 127
+   */
+  /* 0 1 2 3 .. 62 63*/
+  xb_vec2Nx8 dvecPattern1 = IVP_SEQ2NX8();
+  /* 65 66 67 ...126 127*/
+  xb_vec2Nx8 dvecPattern2 = IVP_ADD2NX8(dvecPattern1, 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+  if (!typeFlag)
+  {
+    MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \
+                   dvecPattern2, dvecPattern1, \
+                   IVP_DSELI_8B_INTERLEAVE_1);
+  }
+  else
+  {
+    MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \
+                   dvecPattern2, dvecPattern1, \
+                   IVP_DSELI_INTERLEAVE_1);
+  }
+
+
+  /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time
+   * into two vectors. Also loop across output channels is unrolled twice,
+   * thereby producing four output vectors in 1 iteration
+   */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */
+  {
+    /* variable length for output stores */
+    int32_t remX = XT_MIN(vectorizationWidth, outW - x);
+
+    /* If (remX + kSizeEffU - 1) <= 2 * XCHAL_IVPN_SIMD_WIDTH,
+     * i.e. if the number of input data bytes corresponding to remX number of outputs
+     * is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, there is no need to load
+     * the next 64 input bytes*/
+    int32_t remXLoad = ((remX + dilatedKSizeU - 1) > 2 * XCHAL_IVPN_SIMD_WIDTH) ? 1 : 0;
+
+    for (y = 0; y < outH; y++)    /* Loop across output height */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        /* Load the bias values corresponding to two output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+        dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+        dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        /* Input vector pointer initialization */
+        pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+
+        /* Load 128 bytes from row corresponding to each ky
+         * dvecInData11 = a0 a1 a2 a3.... a63
+         * dvecInData12 = a64 a65 a66 .... a127
+         *
+         * Separate odd and even indices
+         * dvecInData11 = a0 a2 a4 a6.... a126
+         * dvecInData12 = a1 a3 a5 a7.... a127
+         *
+         * Let the coefficients be
+         * C0 C1 C2
+         * C3 C4 C5
+         * C6 C7 C8
+         *
+         * acc11 = [a0 a2 a4 a6.... a126] * C0 +
+         *         [a2 a4 a6.... a126 X ] * C1 +
+         *         [a4 a6.... a126 X  X ] * C2
+         *
+         * acc12 = [a1 a3 a5 a7.... a127] * C0 +
+         *         [a3 a5 a7.... a127 X ] * C1 +
+         *         [a5 a7.... a127 X  X ] * C2
+         * Continue the same multiplication steps for ky = 1 [C3 C4 C5] and ky =2 [C6 C7 C8].
+         * acc11 and acc12 contains convolved output corresponding to even and odd indices
+         * respectively at the end of inchannel loop iterations.
+         *
+         * acc11 and acc12 are interleaved to obtain the outputs in correct order.
+         *
+         */
+        for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+          MORPH_IDT_2Nx8 dvecInData11, dvecInData12;
+          MORPH_IDT_2Nx8 dvecInData21, dvecInData22;
+          MORPH_IDT_2Nx8 dvecInData31, dvecInData32;
+
+          /* load data 128 bytes from first input row  */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \
+                             dilationU * inDataPitch1 - remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          /* Separate odd and even indices */
+          MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                         IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data 128 bytes from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \
+                             dilationU * inDataPitch1 - remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          /* Separate odd and even indices */
+          MORPH_OP_DSELI(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                         IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data 128 bytes from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, inDataPitch2 - \
+                             dilationU * 2 * inDataPitch1 - remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          /* Separate odd and even indices */
+          MORPH_OP_DSELI(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \
+                         IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load all the 3x3 coefficients for 2 output depths*/
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 9);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 9);
+
+          /* Rearrange them so that zero is inserted where the MUL4T should not have effect */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+
+          /* Multiply and accumulate 1st set of 3 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+          /* Multiply and accumulate 2nd set of 3 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData21, dvecInData21, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc21, dvecInData21, dvecInData21, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+          /* Multiply and accumulate 3rd set of 3 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData31, dvecInData31, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc12, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc21, dvecInData31, dvecInData31, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MUL4TA(dacc22, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvec1L, dvec2L, dvec3L, dvec4L;
+        xb_vec2Nx8 dvec1H, dvec2H, dvec3H, dvec4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Interleave odd and even indices */
+        xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1);
+        xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2);
+        xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1);
+        xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2);
+        xb_vec2Nx8 dvecOut3L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern1);
+        xb_vec2Nx8 dvecOut4L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern2);
+        xb_vec2Nx8 dvecOut3H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern1);
+        xb_vec2Nx8 dvecOut4H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern2);
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remX - \
+                       2 * XCHAL_IVPN_SIMD_WIDTH);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remX * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, \
+                       ((bytesPerPixel * remX) - 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 2 * coeffPitch3;
+      }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+    }    /* end of for (y = 0; y < outH; y ++)*/
+  }      /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+
+
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************
+*  xaiConvolved(VQ)3D_S_3x3j1d4I8S8IX_MOW_WHD
+*  **************************************************************************/
+
+/********************************************************************************/
+/* Description : P6 optimized generic implementation for 3x3 3D convolution with*/
+/*               dilation = 4. The MORPH pre-processor specifiers select which  */
+/*               variant is generated at preprocessing time. It is used to      */
+/*               generate the dilated 3x3 3D convolution function and its VQ    */
+/*               counterpart, for U8 and S8 input data, with input stride equal */
+/*               to 1.                                                          */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                  */
+/*               Output scale array, CNN convolution params structure           */
+/* Outputs     : XI Error Code                                                  */
+/* InOuts      : Output Tile                                                    */
+/* Assumptions : CoeffData is S8                                                */
+/*               biasArray is signed 32b, value not exceeding signed 24b        */
+/*               Output scale array is U16                                      */
+/*               OutData is S8 / U8 / S16                                       */
+/*               Kernel Size is 3x3xDxN                                         */
+/*               Input and Output are in WHD format                             */
+/*               Coeff is in WHDN format                                        */
+/********************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_3x3j1d4_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_3x3j1d4_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_3x3j1d4_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_3x3j1d4_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_3x3j1d4), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 3);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_TILE3D_EDGE(inTile, 4);
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_DILATION(param, 4);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitch of Coefficient Data (WHDN) in dim3 */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t dilationU     = XAI_CNN_CONV_GET_DILATION(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Since the dilation value > 1 ,                                      */
+  /* Effective Kernel size = dilation(KernelSize - 1) + 1                */
+  /* Effective kernel size is used for calculating the min required edge */
+  int32_t dilatedKSizeU = dilationU * (kSizeU - 1) + 1;
+
+  /* Move pointer to the start of the data (including edge)              */
+  pInData = &pInData[-((dilatedKSizeU / 2) * inDataPitch1 + (dilatedKSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+
+
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth = 4 * XCHAL_IVPN_SIMD_WIDTH - dilatedKSizeU + 1;
+
+  /* Generating the shuffle pattern for coefficient loads.
+     The idea is to populate zero value where the MUL4T should not affect
+     Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... */
+  xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \
+                                      IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3),
+                                                               IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32,
+                                                   IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))),
+                                    IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+
+  /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time
+   * into two vectors. Also loop across output channels is unrolled twice,
+   * thereby producing four output vectors in 1 iteration
+   */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */
+  {
+    /* variable length for output stores */
+    int32_t remX = XT_MIN(vectorizationWidth, outW - x);
+
+    /* If (remX + dilatedKSizeU - 1) <= 2 * XCHAL_IVPN_SIMD_WIDTH,
+     * i.e. if the number of input data bytes corresponding to remX number of outputs
+     * is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, there is no need to load
+     * the next 2 * XCHAL_IVPN_SIMD_WIDTH input bytes */
+    int32_t remXLoad = ((remX + dilatedKSizeU - 1) > 2 * XCHAL_IVPN_SIMD_WIDTH) ? 1 : 0;
+
+    for (y = 0; y < outH; y++)    /* Loop across output height */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        /* Load the bias values corresponding to two output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+        dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+        dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        /* Input vector pointer initialization */
+        pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+
+        /* Load 128 bytes from row corresponding to each ky
+         * dvecInData11 = a0 a1 a2 a3      ...                   a63
+         * dvecInData12 = a64 a65 a66      ...                   a127
+         *
+         * Deinterleave the indices
+         * dvecInData11 = a0 a2 a4 a6      ...                   a126
+         * dvecInData12 = a1 a3 a5 a7      ...                   a127
+         *
+         * Deinterleave the indices
+         * dvecInData11 = a0 a4 a8   ...   a124 ... a1 a5  ...   a125
+         * dvecInData12 = a2 a6 a10   ...  a126 ... a3 a7  ...   a127
+         *
+         * Let the coefficients be
+         * C0 C1 C2
+         * C3 C4 C5
+         * C6 C7 C8
+         *
+         * acc11 = [a0 a4 a8   ...   a124 ... a1 a5  ...   a125] * C0 +
+         *         [a4 a8   ...   a124 ... a1 a5  ...   a125 X ] * C1 +
+         *         [a8   ...   a124 ... a1 a5  ...   a125 X  X ] * C2
+         *
+         * acc12 = [a2 a6 a10   ...  a126 ... a3 a7  ...   a127] * C0 +
+         *         [a6 a10   ...  a126 ... a3 a7  ...   a127 X ] * C1 +
+         *         [a10   ...  a126 ... a3 a7  ...   a127 X  X ] * C2
+         * Continue the same multiplication steps for ky = 1 [C3 C4 C5] and ky =2 [C6 C7 C8].
+         * acc11 holds the convolved outputs for indices 4k and 4k + 1, and acc12 those
+         * for indices 4k + 2 and 4k + 3, at the end of inchannel loop iterations.
+         *
+         * acc11 and acc12 are interleaved twice to obtain the outputs in correct order.
+         *
+         * Follow same steps for obtaining outputs corresponding to next output channel
+         */
+        for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+          MORPH_IDT_2Nx8 dvecInData11, dvecInData12;
+          MORPH_IDT_2Nx8 dvecInData21, dvecInData22;
+          MORPH_IDT_2Nx8 dvecInData31, dvecInData32;
+
+          /* load data 128 bytes from first input row  */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \
+                             dilationU * inDataPitch1 - remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          /* Deinterleave twice so that elements 4 apart become adjacent */
+          MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                         IVP_DSELI_8B_DEINTERLEAVE_1);
+          MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                         IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data 128 bytes from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \
+                             dilationU * inDataPitch1 - remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          /* Deinterleave twice so that elements 4 apart become adjacent */
+          MORPH_OP_DSELI(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                         IVP_DSELI_8B_DEINTERLEAVE_1);
+          MORPH_OP_DSELI(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                         IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data 128 bytes from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, inDataPitch2 - \
+                             dilationU * 2 * inDataPitch1 - remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          /* Deinterleave twice so that elements 4 apart become adjacent */
+          MORPH_OP_DSELI(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \
+                         IVP_DSELI_8B_DEINTERLEAVE_1);
+          MORPH_OP_DSELI(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \
+                         IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load all the 3x3 coefficients for 2 output depths*/
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 9);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 9);
+
+          /* Rearrange them so that zero is inserted where the MUL4T should not have effect */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+
+          /* Multiply and accumulate 1st set of 3 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+          /* Multiply and accumulate 2nd set of 3 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData21, dvecInData21, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc21, dvecInData21, dvecInData21, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+          /* Multiply and accumulate 3rd set of 3 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData31, dvecInData31, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc12, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc21, dvecInData31, dvecInData31, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MUL4TA(dacc22, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Interleave odd and even indices */
+        /* 8 bit output */
+        if (!typeFlag)
+        {
+          /*
+           * dacc11 and dacc12 contains accumulated values corresponding to same output row.
+           * For 8bit output, dvecOutL contains the required output elements
+           * dvecOut1L = [A0 A4 A8 ... A116 X X A1 A5 ... A117 X X] - 64 elements
+           * dvecOut2L = [A2 A6 A10 ...A118 X X A3 A7 ... A119 X X] - 64 elements
+           * Interleave the elements
+           * dvecOut1L = [A0 A2 A4       ...    A116 A117 X X X X ] - 64 elements
+           * dvecOut2L = [A1 A3 A7       ...    A118 A119 X X X X ] - 64 elements
+           * Interleave the elements
+           * dvecOut1L = [A0 A1 A2 A3                   ...                 ]- 64 elements
+           * dvecOut2L = [  ...         A116 A117 A118 A119 X X X X X X X X ]- 64 elements
+           *
+           * Same steps for outputs corresponding to second output channel.
+           */
+          MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                         IVP_DSELI_8B_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                         IVP_DSELI_8B_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                         IVP_DSELI_8B_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                         IVP_DSELI_8B_INTERLEAVE_1);
+        }
+        else /* 16bit output */
+        {
+          /*
+           * dacc11 and dacc12 contains accumulated values corresponding to same output row.
+           * dvecOut1L = [A0 A4 A8  ... A116 X X] - 32 16b elements
+           * dvecOut1H = [A1 A5 A9  ... A117 X X] - 32 16b elements
+           * dvecOut2L = [A2 A6 A10 ... A118 X X] - 32 16b elements
+           * dvecOut2H = [A3 A7 A11 ... A119 X X] - 32 16b elements
+           * Interleave the elements of dvecOut1L and dvecOut1H
+           * dvecOut1L = [A0 A1 A4 A5         ...      ]
+           * dvecOut1H = [ ...            A116 A117 X X]
+           * Interleave the elements of dvecOut2L and dvecOut2H
+           * dvecOut2L = [A2 A3 A6 A7               ...]
+           * dvecOut2H = [ ...            A118 A119 X X]
+           * Interleave2 the elements of dvecOut2L and dvecOut1L
+           * dvecOut1L = [A0  A1  A2  A3                        ...          ]
+           * dvecOut2L = [A32 A33 A34 A35                        ...         ]
+           * Interleave2 the elements of dvecOut2H and dvecOut1H
+           * dvecOut1H = [A64 A65 A66 A67                  ...               ]
+           * dvecOut2H = [ ...            A116 A117 A118 A119 X X X X X X X X]
+           *
+           * Same steps for outputs corresponding to second output channel.
+           */
+          MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \
+                         IVP_DSELI_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \
+                         IVP_DSELI_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                         IVP_DSELI_INTERLEAVE_2);
+          MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \
+                         IVP_DSELI_INTERLEAVE_2);
+          MORPH_OP_DSELI(dvecOut3H, dvecOut3L, dvecOut3H, dvecOut3L, \
+                         IVP_DSELI_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut4H, dvecOut4L, dvecOut4H, dvecOut4L, \
+                         IVP_DSELI_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                         IVP_DSELI_INTERLEAVE_2);
+          MORPH_OP_DSELI(dvecOut4H, dvecOut3H, dvecOut4H, dvecOut3H, \
+                         IVP_DSELI_INTERLEAVE_2);
+        }
+
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remX - \
+                       2 * XCHAL_IVPN_SIMD_WIDTH);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remX * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * remX) - \
+                                                        2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 2 * coeffPitch3;
+      }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+    }    /* end of for (y = 0; y < outH; y ++)*/
+  }      /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+
+
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+* MOW fold 16 Stride 1: 4x4 kernel, four output rows and two output channels per pass.    *
+* This function is called if inDataPitch1 is less than or equal to 16, so that four input *
+* rows are loaded together into a single vector.                                          *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_4x4j1d1), S8IX_MOW_WHD_FOLD16) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Output width/height and input/output channel counts, read from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitch of Coefficient Data (WHDN) in dim3; used to step from one output channel to the next */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitches of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel width (dim1 of the WHDN coefficient tile); used below to derive the edge sizes */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag  = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  int32_t leftEdge, topEdge;
+  /* Edge size is (kSizeU / 2) when the edge flag is set, (kSizeU / 2) - 1 otherwise */
+  leftEdge = leftEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+  topEdge  = topEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+  /* From here on pInData addresses the top-left element of the edge region */
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 when the output tile is XAI_S16, else 0 */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, y;
+  /* Select sequence to re-arrange input data: lanes below 3 * inDataPitch1 take index
+   * (lane + inDataPitch1), the remaining lanes take (lane - 3 * inDataPitch1 + 64). */
+  xb_vec2Nx8 dvecSeq = 0;
+  IVP_ADD2NX8T(dvecSeq, IVP_SEQ2NX8(), inDataPitch1, IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1));
+  IVP_ADD2NX8T(dvecSeq, IVP_SUB2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1), 64, \
+               IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1)));
+
+  /* The loop across output channels is unrolled twice and 4 output rows are processed
+   * together, so each iteration fills two accumulators (one per output channel)
+   */
+  for (y = 0; y < outH - 3; y += 4)   /* Loop across output height, 4 rows at a time */
+  {
+    /* initialize output data pointer */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize coeff and Bias data pointer */
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+    {
+      /* In order to handle odd output depths: 0 when outCh is the last channel, 1 otherwise */
+      int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+      /* Load the bias values corresponding to two output channels */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+      xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias */
+      xb_vec2Nx24 dacc1, dacc2;
+
+      dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+      dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+      IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+      /* priming of coeff load is done outside the innermost loop*/
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+      valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5;
+
+        /* Point the vector load at the current input channel, then advance pInput to the next one */
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pInput  += inDataPitch2;
+
+        /* load data from 4 input rows [Row0 | Row1 | Row2 | Row3] */
+        valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, 4 * inDataPitch1);
+
+        /* load data from next 4 input rows [Row4 | Row5 | Row6 | Row7] */
+        MORPH_OP_LOAD_2Nx8(dvecInData5, vaInData1, pdvecIn1, 4 * inDataPitch1);
+
+        /* dvecInData1 contains first 4 input rows
+         * dvecInData1: row0 | row1 | row2 | row3
+         *
+         * dvecInData5 contains next 4 input rows
+         * dvecInData5: row4 | row5 | row6 | row7
+         *
+         * Input data is re arranged in such a manner that
+         * dvecInData2 contains: row1 | row2 | row3 | row4
+         * dvecInData3 contains: row2 | row3 | row4 | row5
+         * dvecInData4 contains: row3 | row4 | row5 | row6
+         */
+
+        /* Compute row [Row1 | Row2 | Row3 | Row4] */
+        dvecInData2 = IVP_SEL2NX8(dvecInData5, dvecInData1, dvecSeq);
+
+        /* Compute row [Row2 | Row3 | Row4 | Row5] */
+        dvecInData3 = IVP_SEL2NX8(IVP_SEL2NX8(0, dvecInData5, dvecSeq), dvecInData2, dvecSeq);
+
+        /* Compute row [Row3 | Row4 | Row5 | Row6] */
+        dvecInData4 = IVP_SEL2NX8(IVP_SEL2NX8(0, dvecInData5, IVP_ADD2NX8(dvecSeq, inDataPitch1)), dvecInData3, dvecSeq);
+
+        /* load all the 4x4 coefficients (16 bytes) for each of the 2 output depths */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 16);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 16);
+
+        /* Compute the output of 4 output rows, for the 1st output depth (32-bit coefficient word k pairs with the input shifted down by k rows) */
+        MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MUL4TA(dacc1, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+        MORPH_OP_MUL4TA(dacc1, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+        /* Compute the output of 4 output rows, for the 2nd output depth */
+        MORPH_OP_MUL4TA(dacc2, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc2, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MUL4TA(dacc2, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        MORPH_OP_MUL4TA(dacc2, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+      }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+      /* Pack, Output Scale, Output Shift and clamping */
+      xb_vec2Nx8 dvecOut1L, dvecOut2L;
+      xb_vec2Nx8 dvecOut1H, dvecOut2H;
+
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    pOutScaleData[outCh + enable2ndCh ], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+      /* Storing the first output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, second row (selected from inDataPitch1 * bytesPerPixel bytes into the packed result) */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, third row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, fourth row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, first row (store length is 0 when enable2ndCh is 0) */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+      IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            outDataPitch1) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh \
+                     * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, third row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            2 * outDataPitch1) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)),
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh \
+                     * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, fourth row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            3 * outDataPitch1) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)),
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh \
+                     * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      pOutput += 2 * outDataPitch2 * bytesPerPixel;
+      pCoeff  += 2 * coeffPitch3;
+    }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+  }    /* end of for (y = 0; y < outH - 3; y += 4)*/
+
+  if (y < outH) /* Handle the case when fewer than 4 output rows remain to be processed */
+  {
+    int32_t enable2ndRow = XT_SALT(y, outH - 1); /* 1 when at least 2 output rows remain */
+    int32_t enable3rdRow = XT_SALT(y, outH - 2); /* 1 when 3 output rows remain */
+
+    /* initialize output data pointer */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize coeff and Bias data pointer */
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+    {
+      /* In order to handle odd output depths: 0 when outCh is the last channel, 1 otherwise */
+      int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+      /* Load the bias values corresponding to two output channels */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+      xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias */
+      xb_vec2Nx24 dacc1, dacc2;
+
+      dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+      dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+      IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+      /* priming of coeff load is done outside the innermost loop*/
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+      valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5;
+
+        /* load the remaining input rows: 4 rows, then 0, 1 or 2 more depending on the rows left */
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+
+        valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, 4 * inDataPitch1);
+        MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData5, vaInData1, pdvecIn1, (enable3rdRow + enable2ndRow) * inDataPitch1);
+        pInput += inDataPitch2;
+
+        /* load all the 4x4 coefficients (16 bytes) for each of the 2 output depths */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 16);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 16);
+
+
+        /* dvecInData1 and dvecInData5 hold the loaded input rows:
+         * dvecInData1: row0 | row1 | row2 | row3
+         * dvecInData5: row4 | row5 | row6 | row7
+         * row6 and row7 of dvecInData5 are never loaded here (at most 2 rows are read).
+         *
+         * Input data is re arranged in such a manner that
+         * dvecInData2 contains: row1 | row2 | row3 | row4
+         * dvecInData3 contains: row2 | row3 | row4 | row5
+         * dvecInData4 contains: row3 | row4 | row5 | row6
+         */
+        /* Compute row [Row1 | Row2 | Row3 | Row4] */
+        dvecInData2 = IVP_SEL2NX8(dvecInData5, dvecInData1, dvecSeq);
+
+        /* Compute row [Row2 | Row3 | Row4 | Row5] */
+        dvecInData3 = IVP_SEL2NX8(IVP_SEL2NX8(0, dvecInData5, dvecSeq), dvecInData2, dvecSeq);
+
+        /* Compute row [Row3 | Row4 | Row5 | Row6] */
+        dvecInData4 = IVP_SEL2NX8(IVP_SEL2NX8(0, dvecInData5, IVP_ADD2NX8(dvecSeq, inDataPitch1)), dvecInData3, dvecSeq);
+
+
+        /* Multiply input data with coefficients from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MUL4TA(dacc1, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+        MORPH_OP_MUL4TA(dacc1, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+        /* Multiply input data with coefficients from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc2, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MUL4TA(dacc2, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        MORPH_OP_MUL4TA(dacc2, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+      }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+      /* Pack, Output Scale, Output Shift and clamping */
+      xb_vec2Nx8 dvecOut1L, dvecOut2L;
+      xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+      /* Storing the first output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, second row (store length is 0 when enable2ndRow is 0) */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, 3rd row (store length is 0 when enable3rdRow is 0) */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * enable3rdRow * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable3rdRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, first row (store length is 0 when enable2ndCh is 0) */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+      IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            outDataPitch1 * enable2ndRow) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable2ndRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, 3rd row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            2 * outDataPitch1 * enable3rdRow) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable3rdRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      pOutput += 2 * outDataPitch2 * bytesPerPixel;
+      pCoeff  += 2 * coeffPitch3;
+    } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+  }   /* end of if (y < outH) */
+}
+
+/******************************************************************************************
+* MOW fold 32 Stride 1                                                                    *
+* If inDataPitch1 is less than or equal to                                                *
+* 32 this function is called.                                                             *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_4x4j1d1), S8IX_MOW_WHD_FOLD32) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag  = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  int32_t leftEdge, topEdge;
+
+  leftEdge = leftEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+  topEdge  = topEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, y;
+  int32_t inDataPitch1_2X = 2 * inDataPitch1;
+
+  /* loop across output channels and output height are unrolled twice
+   * to produce four output vectors in 1 iteration
+   */
+  for (y = 0; y < outH; y += 2)   /* Loop across output height */
+  {
+    /* in order to handle odd output height */
+    int32_t enable2Row = XT_SALT(y, outH - 1);
+    /* initialize output data pointer */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize coeff and Bias data pointer */
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+    {
+      /* In order to handle odd output depths */
+      int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+      /* Load the bias values corresponding to two output channels */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+      xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias */
+      xb_vec2Nx24 dacc1, dacc2;
+
+      dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+      dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+      IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+      /* priming of coeff load is done outside the innermost loop*/
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+      valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4;
+
+        /* load data from first 2 input rows */
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1);
+        pInput  += inDataPitch2;
+
+        /* load data from 2 input rows [Row0 | Row1] */
+        valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch1_2X);
+
+        /* load data from 2 input rows [Row1 | Row2] */
+        valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch1_2X);
+
+        /* load data from next 2 input rows [Row2 | Row3] */
+        MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData1, pdvecIn1, inDataPitch1_2X);
+
+        /* load data from next 2 input rows [ex: Row3 | Row4] */
+        MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData2, pdvecIn2, inDataPitch1_2X);
+
+        /* load all the 4x4 coefficients for 2 output depths*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 16);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 16);
+
+        /* 4 vector loads are used to load 5 rows of input. Two output channels are
+           processed at a time. */
+        MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MUL4TA(dacc1, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+        MORPH_OP_MUL4TA(dacc1, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+        MORPH_OP_MUL4TA(dacc2, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc2, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MUL4TA(dacc2, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        MORPH_OP_MUL4TA(dacc2, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+      }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+      /* Pack, Output Scale, Output Shift and clamping */
+      xb_vec2Nx8 dvecOut1L, dvecOut2L;
+      xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    pOutScaleData[outCh + enable2ndCh ], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+      /* Storing the first output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2Row * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, first row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+      IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, second row */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            outDataPitch1 * enable2Row) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                     enable2Row * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      pOutput += 2 * outDataPitch2 * bytesPerPixel;
+      pCoeff  += 2 * coeffPitch3;
+    }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+  }    /* end of for (y = 0; y < outH; y += 2)*/
+}
+
+/*****************************************************************************
+*  xaiConvolved(VQ)3D_S_4x4j1d1I8S8IX_MOW_WHD
+*  **************************************************************************/
+/********************************************************************************/
+/* Description : P6 optimized generic implementation for 4x4 3D convolution with*/
+/*               dilation = 1. Based on MORPH pre-processor specifiers, code    */
+/*               implementation is generated during preprocessing stage. This   */
+/*               method can be used to generate 4x4 3D convolution dilated      */
+/*               function and 4x4 3D VQ convolution dilated function for U8 bit */
+/*               and S8 bit input data with input stride equal to 1             */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                  */
+/*               Output scale array, CNN convolution params structure           */
+/* Outputs     : XI Error Code                                                  */
+/* InOuts      : Output Tile                                                    */
+/* Assumptions : CoeffData is S8                                                */
+/*               biasArray is signed 32b, value not exceeding signed 24b        */
+/*               Output scale array is U16                                      */
+/*               OutData is S8 / U8 / S16                                       */
+/*               Kernel Size is 4x4xDxN                                         */
+/*               Input and Output are in WHD format                             */
+/*               Coeff is in WHDN format                                        */
+/********************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_4x4j1d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_4x4j1d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_4x4j1d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_4x4j1d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_4x4j1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 4);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                           \
+                    XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param);
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)  /* zero output scale: fill the output tile with a constant and return */
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* check inDataPitch1, if it is less than or equal to 16,
+   * call FOLD16 variant otherwise continue
+   */
+
+  if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 16)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_4x4j1d1), S8IX_MOW_WHD_FOLD16) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* check inDataPitch1, if it is less than or equal to 32,
+   * call FOLD32 variant otherwise continue
+   */
+
+  if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 32)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_4x4j1d1), S8IX_MOW_WHD_FOLD32) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitch of Coefficient Data (WHDN) in dim3 */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag  = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  int32_t leftEdge, topEdge;  /* edge sizes: kSizeU / 2 when the flag is set, (kSizeU / 2) - 1 otherwise */
+
+  leftEdge = leftEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+  topEdge  = topEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth = ((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1;
+
+  /* loop across output channels is unrolled twice so that two output
+   * channels (two accumulators) are processed in 1 iteration
+   */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */
+  {
+    for (y = 0; y < outH; y++)   /* Loop across output height */
+    {
+      /* output rows are processed one at a time (no unrolling across height) */
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2)    /* Loop across Output depth */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        /* Load the bias values corresponding to two output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc11, dacc21;
+
+        dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+
+        dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        /* Input vector pointer initialization */
+        pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+
+        for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+          MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4;
+
+          /* load data from first input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 4th input row; the update moves to the 1st row of the next input channel */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData, pdvecIn, inDataPitch2 - (3 * inDataPitch1));
+
+          /* load all the 4x4 coefficients for 2 output depths*/
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 16);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 16);
+
+          /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          MORPH_OP_MUL4TA(dacc21, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc21, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+          /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+          MORPH_OP_MUL4TA(dacc21, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+          /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+          MORPH_OP_MUL4TA(dacc21, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+        }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc21, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc21, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores */
+        int32_t varLen = XT_MIN(vectorizationWidth, outW - x);
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));  /* byte count is 0 unless the output is S16 (typeFlag) */
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * enable2ndCh * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 2 * coeffPitch3;
+      }  /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+    }    /* end of for (y = 0; y < outH; y++)*/
+  }      /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+* 5x5 MOW WHD Stride 1 - DEPTH 3                                                          *
+* If number of input channels is equal to 3                                               *
+* this function is called.                                                                *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j1d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitch of Coefficient Data (WHDN) in dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN) */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8* restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff;
+
+  /* Variable Declarations */
+  int32_t outCh, x, y;
+  int32_t varLen;
+
+  /* Vectorization width is 124 */
+  const int32_t vectorizationWidth = ((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1;
+
+  /* The number of inchannels is 3. In the implementation the 3 channels of the
+   * coefficient tile are loaded into two vectors and, using select operations,
+   * the values are arranged so that quad muls can be used.
+   * First load(dveccoeff1) is used to load two channels of 5x5 coefficient
+   * Second load(dveccoeff2) to load the third channel.
+   * In dveccoeff1, 0 to 24 indices corresponds to channel 1 and 25 to 49
+   * corresponds to channel 2.
+   * Select pattern :
+   * Pattern 1 :
+   *   0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18 ,20,21,22,23 ,
+   *   25,26,27,28, 30,31,32,33, 35,36,37,38, 40,41,42,43, 45,46,47,48
+   *
+   * Pattern 2 :
+   *   0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18,  20,21,22,23 ,
+   *   4,9,14,19, (4,9,14,19)+64, (4,9,14,19)+64+25, 24, 24+64, 24+64+25
+   */
+
+
+
+  xb_vec2Nx8 dvecPattern1, dvecPattern2, dvecTempPattern, dvecSelPattern;
+  /*Pattern1 : 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18 ,20,21,22,23 ,
+              25,26,27,28, 30,31,32,33, 35,36,37,38, 40,41,42,43, 45,46,47,48 */
+  dvecPattern1 = IVP_ADD2NX8(IVP_SEQ2NX8(), IVP_SRLI2NX8(IVP_SEQ2NX8(), 2));
+
+  /*Pattern2 : 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18,  20,21,22,23 ,
+              4,9,14,19, (4,9,14,19)+64, (4,9,14,19)+64+25, 24, 24+64, 24+64+25 */
+
+  /* dvecTempPattern 4,9,14,19, (4,9,14,19)+64, (4,9,14,19)+64+25*/
+  dvecTempPattern = IVP_SLLI2NX8(IVP_ADD2NX8(IVP_AND2NX8(IVP_SEQ2NX8(), 3), 1), 2);
+  dvecTempPattern = IVP_ADD2NX8(dvecTempPattern, IVP_AND2NX8(IVP_SEQ2NX8(), 3));
+  IVP_ADD2NX8T(dvecTempPattern, dvecTempPattern, (2 * XCHAL_IVPN_SIMD_WIDTH), IVP_NOTB(IVP_LTR2N(4)));
+  IVP_ADD2NX8T(dvecTempPattern, dvecTempPattern, 25, IVP_NOTB(IVP_LTR2N(8)));
+  dvecSelPattern = IVP_SEQ2NX8();
+  IVP_ADD2NX8T(dvecSelPattern, dvecSelPattern, ((xb_vec2Nx8U) (2 * XCHAL_IVPN_SIMD_WIDTH - 20)), IVP_NOTB(IVP_LTR2N(20)));
+  dvecPattern2   = IVP_SEL2NX8(dvecTempPattern, dvecPattern1, dvecSelPattern);
+  dvecSelPattern = IVP_SEQ2NX8();
+  IVP_ADD2NX8T(dvecSelPattern, dvecSelPattern, ((xb_vec2Nx8U) (2 * XCHAL_IVPN_SIMD_WIDTH - 32)), IVP_NOTB(IVP_LTR2N(32)));
+  dvecPattern2 = IVP_SEL2NX8(IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32             \
+                                                    (24 + (88 << 8) + (113 << 16))), \
+                             dvecPattern2,
+                             dvecSelPattern);
+
+  /* loop across output height is unrolled twice and loops across inchannels,
+   * kernel width and kernel height are completely unrolled
+   */
+  for (x = 0; x < outW; x += vectorizationWidth)  /* Loop across output width */
+  {
+    varLen = XT_MIN(vectorizationWidth, outW - x);
+    /* In order to handle cases where input width <= 2*XCHAL_IVPN_SIMD_WIDTH, where
+     * the 2nd load from the same row needs to be avoided.  */
+    int32_t enable2ndCol = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, varLen + kSizeU - 1);
+
+    for (y = 0; y < outH; y += 2) /* Loop across output height */
+    {
+      /* In order to handle odd output height */
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer*/
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh++)  /* Loop across Output depth */
+      {
+        /* load and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = dacc2 = dacc3 = dacc4 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc3, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc4, hvecBias1, hvecBias1);
+
+        /* Coefficient and Input data pointers */
+        pdvecCoeff = (xb_vec2Nx8 *) (pCoeff);
+        pdvecIn    = (MORPH_IDT_2Nx8 *) pInput;
+
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, dvecInData51;
+        MORPH_IDT_2Nx8 dvecInData61, dvecInData71, dvecInData81, dvecInData91, dvecInDataA1;
+        MORPH_IDT_2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, dvecInData52;
+        MORPH_IDT_2Nx8 dvecInData62, dvecInData72, dvecInData82, dvecInData92, dvecInDataA2;
+
+        /* load 5x5 coefficients from three channels*/
+        valign vaCoeffData; vaCoeffData = IVP_LA2NX8_PP(pdvecCoeff);
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData, pdvecCoeff, 2 * coeffPitch2);
+        IVP_LA2NX8_IP(dvecCoeffData2, vaCoeffData, pdvecCoeff);
+
+        /* Rearrange them so that 3 x 4 MUL4T, 4 MULQ can be used to perform entire operation */
+        dvecCoeffData2 = IVP_SEL2NX8(dvecCoeffData1, dvecCoeffData2, dvecPattern2);
+        dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecPattern1);
+
+        /* Input Channel 1*/
+        /* load data from first input row */
+        valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 2nd input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 3rd input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 4th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 5th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \
+                           inDataPitch1 * enable2ndRow      \
+                           - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 6th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn,                 \
+                           inDataPitch2 - (4 + enable2ndRow) * inDataPitch1 \
+                           - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+
+        /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData12, dvecInData11, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc2, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc3, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc4, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+        /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MUL4TA(dacc2, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MUL4TA(dacc3, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MUL4TA(dacc4, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+        MORPH_OP_MUL4TA(dacc2, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+        MORPH_OP_MUL4TA(dacc3, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+        MORPH_OP_MUL4TA(dacc4, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+        /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+        MORPH_OP_MUL4TA(dacc2, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+        MORPH_OP_MUL4TA(dacc3, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+        MORPH_OP_MUL4TA(dacc4, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+        /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+        MORPH_OP_MUL4TA(dacc2, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+        MORPH_OP_MUL4TA(dacc3, dvecInData62, dvecInData61, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+        MORPH_OP_MUL4TA(dacc4, dvecInData62, dvecInData62, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+
+        /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */
+        MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6));
+
+        MORPH_OP_MULQA(dacc2, IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData12, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6));
+
+        MORPH_OP_MULQA(dacc3, IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6));
+
+        MORPH_OP_MULQA(dacc4, IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6));
+
+        /* Input data in channel 1, corresponding to 24th coefficient */
+        dvecInData71 = IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4);
+        dvecInData72 = IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4);
+        dvecInData81 = IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4);
+        dvecInData82 = IVP_SEL2NX8I(dvecInData62, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+        /* Input Channel 2*/
+        /* load data from first input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 2nd input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 3rd input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 4th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 5th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \
+                           inDataPitch1 * enable2ndRow      \
+                           - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 6th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn,                 \
+                           inDataPitch2 - (4 + enable2ndRow) * inDataPitch1 \
+                           - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData12, dvecInData11, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+        MORPH_OP_MUL4TA(dacc2, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+        MORPH_OP_MUL4TA(dacc3, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+        MORPH_OP_MUL4TA(dacc4, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+        /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6));
+        MORPH_OP_MUL4TA(dacc2, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6));
+        MORPH_OP_MUL4TA(dacc3, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6));
+        MORPH_OP_MUL4TA(dacc4, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6));
+
+        /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7));
+        MORPH_OP_MUL4TA(dacc2, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7));
+        MORPH_OP_MUL4TA(dacc3, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7));
+        MORPH_OP_MUL4TA(dacc4, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7));
+
+        /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+        MORPH_OP_MUL4TA(dacc2, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+        MORPH_OP_MUL4TA(dacc3, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+        MORPH_OP_MUL4TA(dacc4, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+
+        /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+        MORPH_OP_MUL4TA(dacc2, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+        MORPH_OP_MUL4TA(dacc3, dvecInData62, dvecInData61, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+        MORPH_OP_MUL4TA(dacc4, dvecInData62, dvecInData62, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+
+        /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */
+        MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7));
+
+        MORPH_OP_MULQA(dacc2, IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData12, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7));
+
+        MORPH_OP_MULQA(dacc3, IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7));
+
+        MORPH_OP_MULQA(dacc4, IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7));
+
+        /* Input data in channel 2, corresponding to 24th coefficient */
+        dvecInData91 = IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4);
+        dvecInData92 = IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4);
+        dvecInDataA1 = IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4);
+        dvecInDataA2 = IVP_SEL2NX8I(dvecInData62, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+        /* Input Channel 3*/
+        /* load data from first input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 2nd input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 3rd input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 4th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 5th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \
+                           inDataPitch1 * enable2ndRow      \
+                           - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 6th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn,                 \
+                           inDataPitch2 - (4 + enable2ndRow) * inDataPitch1 \
+                           - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData12, dvecInData11, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc2, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc3, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc4, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+        /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MUL4TA(dacc2, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MUL4TA(dacc3, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MUL4TA(dacc4, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+        /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        MORPH_OP_MUL4TA(dacc2, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        MORPH_OP_MUL4TA(dacc3, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        MORPH_OP_MUL4TA(dacc4, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+        /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+        MORPH_OP_MUL4TA(dacc2, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+        MORPH_OP_MUL4TA(dacc3, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+        MORPH_OP_MUL4TA(dacc4, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+
+        /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */
+        MORPH_OP_MUL4TA(dacc1, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+        MORPH_OP_MUL4TA(dacc2, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+        MORPH_OP_MUL4TA(dacc3, dvecInData62, dvecInData61, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+        MORPH_OP_MUL4TA(dacc4, dvecInData62, dvecInData62, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+
+        /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */
+        MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+        MORPH_OP_MULQA(dacc2, IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData12, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+        MORPH_OP_MULQA(dacc3, IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+        MORPH_OP_MULQA(dacc4, IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+        /* Multiply and accumulate the data corresponding to 24th coefficient */
+        MORPH_OP_MULQA(dacc1, 0, dvecInData91, dvecInData71,
+                       IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+        MORPH_OP_MULQA(dacc2, 0, dvecInData92, dvecInData72,
+                       IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+        MORPH_OP_MULQA(dacc3, 0, dvecInDataA1, dvecInData81,
+                       IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+        MORPH_OP_MULQA(dacc4, 0, dvecInDataA2, dvecInData82,
+                       IVP_SEL2NX8I(dvecInData62, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* 1st row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * \
+                       (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* 2nd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * \
+                       (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \
+                       2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += outDataPitch2 * bytesPerPixel;
+        pCoeff  += coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/
+    }   /* end of for (y = 0; y < outH; y += 2)*/
+  }    /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+}
+
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_5x5j1d1I8S8IX_MOW_WHD_FOLD16
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 5x5 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 5x5 3D dilated convolution function and 5x5 */
+/*               3D VQ dilated convolution function for U8 and S8 input data  */
+/*               with input stride equal to 1.                                */
+/*               If inDataPitch1 <= 16, this function variant is called.      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 5x5xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j1d1), S8IX_MOW_WHD_FOLD16) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Dimensions read from the tile structures (output W/H, input and output depth) */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitches of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel width: dim1 of the WHDN coefficient tile */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Step the input pointer back by kSizeU/2 rows and kSizeU/2 columns so it points at the start of the data including the edge */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Output clamp limits: the ReLU bounds when ReLU is enabled, otherwise the full range of the output tile type */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+
+  /* Loop counters */
+  int32_t inCh, outCh, y;
+
+  /* Builds (once, outside all loops) the shuffle index sequence
+   * 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18 ,20,21,22,23 ,4,9,14,19 ,24.
+   * It is applied to the 25 coefficient bytes of each 5x5 kernel plane so
+   * that the first four coefficients of every kernel row form one 32-bit
+   * element, and the last coefficient of rows 1-4 form another one.
+   * c11, c12, c13, c14, c15
+   * c21, c22, c23, c24, c25
+   * c31, c32, c33, c34, c35
+   * c41, c42, c43, c44, c45
+   * c51, c52, c53, c54, c55
+   *
+   * c15, c25, c35, c45 and c55 are placed in contiguous fashion, so that
+   * c15, c25, c35, and c45 can be used as one 32-bit element (4 bytes)
+   */
+  xb_vec2Nx8 dvecIdx;
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 10),
+                                      IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(),
+                                      IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SELI_INTERLEAVE_4_LO);
+
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(
+                                        IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + \
+                                                               (19 << 24))),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO),
+                         dvecIdx, IVP_SELI_8B_PACK_16);
+
+  /* Select sequences to re-arrange input data. NOTE(review): dvecSeq1..3 are not referenced again in this function - confirm whether they can be dropped */
+  xb_vec2Nx8 dvecSeq1 = 0;
+  IVP_ADD2NX8T(dvecSeq1, IVP_SEQ2NX8(), inDataPitch1, IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1));
+  IVP_ADD2NX8T(dvecSeq1, IVP_SUB2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1), 2 * XCHAL_IVPN_SIMD_WIDTH, \
+               IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1)));
+
+  xb_vec2Nx8 dvecSeq2 = 0;
+  IVP_ADD2NX8T(dvecSeq2, IVP_SEQ2NX8(), 2 * inDataPitch1, \
+               IVP_LT2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1));
+  IVP_ADD2NX8T(dvecSeq2, IVP_SUB2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1), 2 * XCHAL_IVPN_SIMD_WIDTH, \
+               IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1)));
+
+  xb_vec2Nx8 dvecSeq3 = 0;
+  IVP_ADD2NX8T(dvecSeq3, IVP_SEQ2NX8(), 3 * inDataPitch1, \
+               IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1));
+  IVP_ADD2NX8T(dvecSeq3, IVP_SUB2NX8(IVP_SEQ2NX8(), inDataPitch1), 2 * XCHAL_IVPN_SIMD_WIDTH, \
+               IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), inDataPitch1)));
+
+
+  /* loop across output height is unrolled 4 times and
+   * loop across kernel width and height is completely unrolled
+   */
+  for (y = 0; y < outH; y += 4) /* Loop across output height */
+  {
+    /* Row-enable flags so that a trailing group of fewer than 4 output rows is handled */
+    int32_t enable2ndRow = XT_SALT(y, outH - 1);
+    int32_t enable3rdRow = XT_SALT(y, outH - 2);
+    int32_t enable4thRow = XT_SALT(y, outH - 3);
+
+    /* initialize output data pointer */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize coeff and Bias data pointer*/
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh++)  /* Loop across Output depth */
+    {
+      /* load and replicate bias data */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias */
+      xb_vec2Nx24 dacc1;
+      dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+      /* priming of coeff load is done outside the innermost loop*/
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+      pdvecIn = (MORPH_IDT_2Nx8 *) pInput;
+
+      for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input channels */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1;
+        xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5;
+
+        /* five vector loads starting at successive input rows; the last step nets the pointer to +inDataPitch2 (same rows, next input channel) */
+        valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, inDataPitch1);
+        MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch1);
+        MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, inDataPitch1);
+        MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData, pdvecIn, inDataPitch1);
+        MORPH_OP_LOAD_2Nx8(dvecInData5, vaInData, pdvecIn, inDataPitch2 - (4 * inDataPitch1));
+
+        /* dvecInData1: row0 | row1 | row2 | row3
+         * dvecInData2: row1 | row2 | row3 | row4
+         * dvecInData3: row2 | row3 | row4 | row5
+         * dvecInData4: row3 | row4 | row5 | row6
+         * dvecInData5: row4 | row5 | row6 | row7
+         */
+
+        /* load all the 5x5 coefficients */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+
+        /*Rearrange them so that 5 MUL4TA, 1 MULQA & 1 MULA can be used to perform entire operation*/
+        dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+
+        /* Multiply and accumulate using 1st set of 4 coefficients */
+        MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+        /* Multiply and accumulate using 2nd set of 4 coefficients */
+        MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* Multiply and accumulate using 3rd set of 4 coefficients */
+        MORPH_OP_MUL4TA(dacc1, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+        /* Multiply and accumulate using 4th set of 4 coefficients */
+        MORPH_OP_MUL4TA(dacc1, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+        /* Multiply and accumulate using 5th set of 4 coefficients */
+        MORPH_OP_MUL4TA(dacc1, dvecInData5, dvecInData5, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+
+        /* Multiply and accumulate using 6th set of 4 coefficients (c15, c25, c35, c45 after the shuffle) on the rotated rows */
+        MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData4, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+        /* Multiply and accumulate using the final coefficient (c55, byte 24 after the shuffle) */
+        MORPH_OP_MULA(dacc1, IVP_SEL2NX8I(dvecInData5, dvecInData5, IVP_SELI_8B_ROTATE_RIGHT_4),
+                      IVP_EXTR2NX8(dvecCoeffData1, 24));
+      }  /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+      /* Pack, Output Scale, Output Shift and clamping */
+      xb_vec2Nx8 dvecOut1L, dvecOut1H;
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+      /* Storing the first row output */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second row output (byte count and row offset are 0 when enable2ndRow is 0) */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L,                                      \
+                                 IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the third row output (byte count and row offset are 0 when enable3rdRow is 0) */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + enable3rdRow * 2 * outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L,                                          \
+                                 IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable3rdRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the fourth row output (byte count and row offset are 0 when enable4thRow is 0) */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + enable4thRow * 3 * outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L,                                          \
+                                 IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable4thRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      pOutput += outDataPitch2 * bytesPerPixel;
+      pCoeff  += coeffPitch3;
+    } /* end of for (outCh = 0; outCh < numOutCh; outCh ++)*/
+  }   /* end of for (y = 0; y < outH; y += 4)*/
+}
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_5x5j1d1I8S8IX_MOW_WHD_FOLD32
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 5x5 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 5x5 3D dilated convolution function and 5x5 */
+/*               3D VQ dilated convolution function for 8-bit unsigned (U8)   */
+/*               and signed (S8) input data with input stride equal to 1.     */
+/*               If inDataPitch1 <= 32, this function variant is called.      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 5x5xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j1d1), S8IX_MOW_WHD_FOLD32) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Dimensions read from the tile structures (output W/H, input and output depth) */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitches of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel width: dim1 of the WHDN coefficient tile */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Step the input pointer back by kSizeU/2 rows and kSizeU/2 columns so it points at the start of the data including the edge */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Output clamp limits: the ReLU bounds when ReLU is enabled, otherwise the full range of the output tile type */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+
+  /* Loop counters */
+  int32_t inCh, outCh, y;
+
+  /* Builds (once, outside all loops) the shuffle index sequence
+   * 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18 ,20,21,22,23 ,4,9,14,19 ,24.
+   * It is applied to the 25 coefficient bytes of each 5x5 kernel plane so
+   * that the first four coefficients of every kernel row form one 32-bit
+   * element, and the last coefficient of rows 1-4 form another one.
+   * c11, c12, c13, c14, c15
+   * c21, c22, c23, c24, c25
+   * c31, c32, c33, c34, c35
+   * c41, c42, c43, c44, c45
+   * c51, c52, c53, c54, c55
+   *
+   * c15, c25, c35, c45 and c55 are placed in contiguous fashion, so that
+   * c15, c25, c35, and c45 can be used as one 32-bit element (4 bytes)
+   */
+  xb_vec2Nx8 dvecIdx;
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 10),
+                                      IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(),
+                                      IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SELI_INTERLEAVE_4_LO);
+
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(
+                                        IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + \
+                                                               (19 << 24))),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO),
+                         dvecIdx, IVP_SELI_8B_PACK_16);
+
+
+  /* NOTE(review): the text here used to describe a 3-load + select  */
+  /* scheme (vec1 = row0|row1, vec3 = row2|row3, vec5 = row4|row5,   */
+  /* vec2/vec4 derived by select). That does not match the loop      */
+  /* below, which issues five vector loads (dvecInData1..5), one     */
+  /* starting at each of five successive input rows, per channel.    */
+  /* dvecSavSeq (next statement) is the select index pattern used    */
+  /* when storing the second output row: each lane index is offset   */
+  /* by inDataPitch1 * bytesPerPixel.                                */
+
+  xb_vec2Nx8 dvecSavSeq = IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel);
+  /* loop across output height is unrolled twice and loop across kernel width and height is
+   * completely unrolled
+   */
+  for (y = 0; y < outH; y += 2) /* Loop across output height */
+  {
+    /* Row-enable flag for the second row of the pair, in order to handle odd output height */
+    int32_t enable2ndRow = XT_SALT(y, outH - 1);
+    /* initialize output data pointer */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize coeff and Bias data pointer*/
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh++)  /* Loop across Output depth */
+    {
+      /* load and replicate bias data */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias */
+      xb_vec2Nx24 dacc1;
+      dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+      /* priming of coeff load is done outside the innermost loop*/
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+      pdvecIn = (MORPH_IDT_2Nx8 *) pInput;
+
+      for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input channels */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1;
+        xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5;
+
+        /* five vector loads starting at successive input rows; the last step nets the pointer to +inDataPitch2 (same rows, next input channel) */
+        valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, inDataPitch1);
+        MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch1);
+        MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, inDataPitch1);
+        MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData, pdvecIn, inDataPitch1);
+        MORPH_OP_LOAD_2Nx8(dvecInData5, vaInData, pdvecIn, inDataPitch2 - (4 * inDataPitch1));
+
+        /* load all the 5x5 coefficients */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+
+        /*Rearrange them so that 5 MUL4TA, 1 MULQA & 1 MULA can be used to perform entire operation*/
+        dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+
+        /* Multiply and accumulate using 1st set of 4 coefficients */
+        MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+        /* Multiply and accumulate using 2nd set of 4 coefficients */
+        MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* Multiply and accumulate using 3rd set of 4 coefficients */
+        MORPH_OP_MUL4TA(dacc1, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+        /* Multiply and accumulate using 4th set of 4 coefficients */
+        MORPH_OP_MUL4TA(dacc1, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+        /* Multiply and accumulate using 5th set of 4 coefficients */
+        MORPH_OP_MUL4TA(dacc1, dvecInData5, dvecInData5, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+
+        /* Multiply and accumulate using 6th set of 4 coefficients (c15, c25, c35, c45 after the shuffle) on the rotated rows */
+        MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData4, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_4),
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+        /* Multiply and accumulate using the final coefficient (c55, byte 24 after the shuffle) */
+        MORPH_OP_MULA(dacc1, IVP_SEL2NX8I(dvecInData5, dvecInData5, IVP_SELI_8B_ROTATE_RIGHT_4),
+                      IVP_EXTR2NX8(dvecCoeffData1, 24));
+      }  /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+      /* Pack, Output Scale, Output Shift and clamping */
+      xb_vec2Nx8 dvecOut1L;
+      xb_vec2Nx8 dvecOut1H;
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+      /* Storing the first row output */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second row output (byte count and row offset are 0 when enable2ndRow is 0) */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, dvecSavSeq), vaOutData, pdvecOut, \
+                     bytesPerPixel * enable2ndRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      pOutput += outDataPitch2 * bytesPerPixel;
+      pCoeff  += coeffPitch3;
+    } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/
+  }   /* end of for (y = 0; y < outH; y += 2)*/
+}
+/******************************************************************************************
+*  xaiConvolved(VQ)3D_S_5x5j1d1I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 5x5 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 5x5 3D dilated convolution function and 5x5 */
+/*               3D VQ dilated convolution function for 8-bit unsigned (U8)   */
+/*               and signed (S8) input data with input stride equal to 1.     */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 5x5xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_5x5j1d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_5x5j1d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_5x5j1d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_5x5j1d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_5x5j1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 5);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                           \
+                    XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE3D_EDGE(inTile, 2);
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                   \
+                    XAI_ERR_NORM, "The accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                             \
+                    XAI_ERR_NORM, "The output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  if (XAI_TILE3D_GET_DIM3(inTile) == 3)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j1d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+  if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 16)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j1d1), S8IX_MOW_WHD_FOLD16) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+  else if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 32)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j1d1), S8IX_MOW_WHD_FOLD32) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN) */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+
+  const uint8_t outShiftU  = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  int32_t varLen;
+
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth60  = ((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1;
+  const int32_t vectorizationWidth124 = ((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1;
+
+  /* Generates the index sequence 0,1,2,3, 5,6,7,8, 10,11,12,13, 15,16,17,18,
+   * 20,21,22,23, 4,9,14,19, 24. It is used to shuffle the coeff data,
+   * so that the last coeff from each of the first 4 rows of coeffs can be
+   * used as one 32-bit element (four 8-bit coeffs) and make use of the quad
+   * multiplier outside the innermost loop.
+   * c11, c12, c13, c14, c15
+   * c21, c22, c23, c24, c25
+   * c31, c32, c33, c34, c35
+   * c41, c42, c43, c44, c45
+   * c51, c52, c53, c54, c55
+   *
+   * c15, c25, c35, c45 and c55 are placed in contiguous fashion, so that
+   * c15, c25, c35, and c45 can be used as one 32-bit element
+   */
+  xb_vec2Nx8 dvecIdx;
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 10),
+                                      IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(),
+                                      IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SELI_INTERLEAVE_4_LO);
+
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(
+                                        IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + \
+                                                               (19 << 24))),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO),
+                         dvecIdx, IVP_SELI_8B_PACK_16);
+
+  /* loop across output height is unrolled twice.
+   * Loop across kernel width and height is
+   * completely unrolled.
+   * 128 bytes of input are loaded.
+   */
+  for (x = 0; x < outW - vectorizationWidth60; x += vectorizationWidth124)  /* Loop across output width */
+  {
+    varLen = XT_MIN(vectorizationWidth124, outW - x);
+    /* In order to handle cases where input width <= 2*XCHAL_IVPN_SIMD_WIDTH, where
+     * the 2nd load from the same row needs to be avoided.  */
+    int32_t enable2ndCol = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, varLen + kSizeU - 1);
+
+    for (y = 0; y < outH; y += 2) /* Loop across output height */
+    {
+      /* In order to handle odd output height */
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer*/
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+
+      for (outCh = 0; outCh < numOutCh; outCh++)  /* Loop across Output depth */
+      {
+        /* load and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = dacc2 = dacc3 = dacc4 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc3, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc4, hvecBias1, hvecBias1);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) pInput;
+
+        for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1;
+          xb_vec2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, dvecInData51, dvecInData61;
+          xb_vec2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, dvecInData52, dvecInData62;
+
+          /* load data from first input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \
+                             inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \
+                             inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \
+                             inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 4th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \
+                             inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 5th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \
+                             inDataPitch1 * enable2ndRow      \
+                             - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 6th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn,                 \
+                             inDataPitch2 - (4 + enable2ndRow) * inDataPitch1 \
+                             - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load all the 5x5 coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+
+          /* Rearrange them so that 4 MUL4T, 1 MULQ and 1 MUL can be used to perform entire operation */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+
+          /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, dvecInData12, dvecInData11, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc3, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc4, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc3, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc4, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc2, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc3, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc4, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+          /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc2, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc3, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc4, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+          /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MUL4TA(dacc2, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MUL4TA(dacc3, dvecInData62, dvecInData61, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MUL4TA(dacc4, dvecInData62, dvecInData62, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+
+          /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */
+          MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          MORPH_OP_MULQA(dacc2, IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData12, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          MORPH_OP_MULQA(dacc3, IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          MORPH_OP_MULQA(dacc4, IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          /* Multiply and accumulate the final coefficient for all the outputs */
+          MORPH_OP_MULA(dacc1, IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_EXTR2NX8(dvecCoeffData1, 24));
+          MORPH_OP_MULA(dacc2, IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_EXTR2NX8(dvecCoeffData1, 24));
+          MORPH_OP_MULA(dacc3, IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_EXTR2NX8(dvecCoeffData1, 24));
+          MORPH_OP_MULA(dacc4, IVP_SEL2NX8I(dvecInData62, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_EXTR2NX8(dvecCoeffData1, 24));
+        }  /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+
+        /* 1st row  */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* 2nd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * \
+                       (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \
+                       2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += outDataPitch2 * bytesPerPixel;
+        pCoeff  += coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/
+    }   /* end of for (y = 0; y < outH; y += 2)*/
+  }    /* end of for (x = 0; x < outW - vectorizationWidth60; x += vectorizationWidth124)*/
+
+  /* To handle cases where the remaining output width is less than or equal to 60.
+   * loop across output height is unrolled twice. Loop across kernel width and height is
+   * completely unrolled. 64 bytes of input are loaded.
+   */
+  if (x < outW)
+  {
+    for (y = 0; y < outH; y += 2) /* Loop across output height */
+    {
+      /* In order to handle odd output height */
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer*/
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+
+      for (outCh = 0; outCh < numOutCh; outCh++)  /* Loop across Output depth */
+      {
+        /* load and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2;
+        dacc1 = dacc2 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) pInput;
+
+        for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1;
+          xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5, dvecInData6;
+
+          /* load data from first input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 4th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 5th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData5, vaInData, pdvecIn, inDataPitch1 * enable2ndRow);
+
+          /* load data from 6th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData6, vaInData, pdvecIn, inDataPitch2 - (4 + enable2ndRow) * inDataPitch1);
+
+          /* load all the 5x5 coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+
+          /* Rearrange them so that 4 MUL4T, 1 MULQ and 1 MUL can be used to perform entire operation */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+
+          /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc2, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+          /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc2, dvecInData5, dvecInData5, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+          /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, dvecInData5, dvecInData5, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MUL4TA(dacc2, dvecInData6, dvecInData6, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+
+          /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */
+          MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData4, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          MORPH_OP_MULQA(dacc2, IVP_SEL2NX8I(dvecInData5, dvecInData5, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData4, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          /* Multiply and accumulate the final coefficient for all the outputs */
+          MORPH_OP_MULA(dacc1, IVP_SEL2NX8I(dvecInData5, dvecInData5, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_EXTR2NX8(dvecCoeffData1, 24));
+          MORPH_OP_MULA(dacc2, IVP_SEL2NX8I(dvecInData6, dvecInData6, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_EXTR2NX8(dvecCoeffData1, 24));
+        }  /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores */
+        varLen = XT_MIN(vectorizationWidth60, outW - x);
+
+        /* Storing the first depth output */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* first depth , 2nd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += outDataPitch2 * bytesPerPixel;
+        pCoeff  += coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/
+    }   /* end of for (y = 0; y < outH; y += 2)*/
+  }     /* end of if( x < outW)*/
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+* 5x5 MOW WHD Stride 2 - DEPTH 3                                                          *
+* If number of input channels is equal to 3                                               *
+* this function is called.                                                                *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j2d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8* restrict pdvecIn1;
+  MORPH_IDT_2Nx8* restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+  xb_vec2Nx8* restrict pdvecCoeff3;
+  xb_vec2Nx8* restrict pdvecCoeff4;
+
+  /* Variable Declarations */
+  int32_t outCh, x, y;
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1;
+
+  /* Since there are 25 coefficients for 1 output channel, we can make use of 6 quad multipliers
+   * for generating 1 output. So we need to re-arrange the 25 coefficients in the pattern shown
+   * Pattern : 0 1 2 3 5 6 7 8 10 11 12 13 15 16 17 18 20 21 22 23 4 9 14 19 24 */
+  xb_vec2Nx8 dvecIdx;
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 10), IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(),
+                                      IVP_SELI_INTERLEAVE_2_LO), IVP_SELI_INTERLEAVE_4_LO);
+
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(
+                                        IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + (19 << 24))),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO),
+                         dvecIdx, IVP_SELI_8B_PACK_16);
+
+  /* loop across output depth is unrolled by 4
+   * , producing lanes from 4 output channels
+   * in one iteration. Since vectorization width
+   * is just half the width of the accumulator,
+   * loop across output height is also unrolled by 2.
+   * Unrolling across output height makes it possible
+   * to utilize all the 64 MACs in the accumulator.
+   *
+   * Data loaded from the 2 input rows is concatenated
+   * in such a manner that lower half of the output
+   * vector gives the first output row and the upper
+   * half of the output vector gives the next output row.
+   */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */
+  {
+    for (y = 0; y < outH; y += 2)  /* Loop across output height */
+    {
+      /* In order to handle odd output heights */
+      int32_t enable2Row = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4)  /* Loop across Output depth */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* Load the bias values corresponding to four output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* priming for coeff load */
+        /* Coeff for 1st output channel */
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+        /* Coeff for 2nd output channel */
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+        /* Coeff for 3rd output channel */
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+        /* Coeff for 4th output channel */
+        pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh);
+        valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4);
+
+
+        /* Input vector pointer initialization */
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4;
+        xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, \
+                   dvecInData5, dvecInData6, dvecInData7;
+        MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+        MORPH_IDT_2Nx8 dvecData51, dvecData52, dvecData53, dvecData54, dvecData55;
+
+/**************************************** 1st inCh *********************************************/
+        /* load data from 1st input row */
+        valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData1, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* load data from 2nd input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData2, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* load data from 3rd input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData3, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* load data from 4th input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData4, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* load data from 5th input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData5, vaInData1, pdvecIn1, inDataPitch1 * enable2Row);
+
+        /* load data from 6th input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData6, vaInData1, pdvecIn1, inDataPitch1 * enable2Row);
+
+        /* load data from 7th input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData7, vaInData1, pdvecIn1, \
+                      inDataPitch2 - (4 + 2 * enable2Row) * inDataPitch1);
+
+        /* load all the 5x5 coefficients for 4 output depths*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2);
+
+        /* Rearrange them so that max no. of quad multipliers can be used */
+        dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+        dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+        dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+        dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx);
+
+        /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+         * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...63, and the 2nd input row is
+         * 64,65,66,67.........126,127, Data should be arranged  as
+         *
+         * dvecData1 : 0, 2, 4,...58,60,62,64,66,68,...122,124,126
+         * dvecData2 : 1, 3, 5,...59,61,63,65,67,69,...123,125,127
+         * dvecData3 : 2, 4, 6,...60,62,0 ,66,68,70,...124,126,0
+         * dvecData4 : 3, 5, 7,...61,63,0 ,67,69,71,...125,127,0
+         * dvecData5 : 4, 6, 8,...62,0 ,0 ,68,70,72,...126,0  ,0
+         *
+         * Lower half of the vectors contain data from 1st input row and
+         * upper half of the vectors contain data from 2nd input row.
+         *
+         */
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+        dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* dvecData5 is kept separately and is used by quad multiplier finally */
+        dvecData51 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_2);
+
+        /* Values corresponding to first and second row are packed in one register
+            so that same coefficient will get multiplied to them */
+        /* Multiply and accumulate 4 coefficients from 1st set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+        IVP_DSEL2NX8I(dvecData52, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Multiply and accumulate 4 coefficients from 2nd set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+        IVP_DSEL2NX8I(dvecData53, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Multiply and accumulate 4 coefficients from 3rd set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+        IVP_DSEL2NX8I(dvecData54, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Multiply and accumulate 4 coefficients from 4th set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3));
+
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+        IVP_DSEL2NX8I(dvecData55, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Multiply and accumulate 4 coefficients from 5th set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 4));
+
+
+        /* Multiply and acc last coefficient from 1st 4 sets of coeffs for all the outputs*/
+        MORPH_OP_MULQA(dacc1, dvecData54, dvecData53, dvecData52, dvecData51,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+        MORPH_OP_MULQA(dacc2, dvecData54, dvecData53, dvecData52, dvecData51,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+        MORPH_OP_MULQA(dacc3, dvecData54, dvecData53, dvecData52, dvecData51,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5));
+        MORPH_OP_MULQA(dacc4, dvecData54, dvecData53, dvecData52, dvecData51,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 5));
+
+
+        /* Multiply and acc last coefficient(24) with the last row from 2 output channels */
+        MORPH_OP_MULA(dacc1, dvecData55, IVP_EXTR2NX8(dvecCoeffData1, 24));
+        MORPH_OP_MULA(dacc2, dvecData55, IVP_EXTR2NX8(dvecCoeffData2, 24));
+        MORPH_OP_MULA(dacc3, dvecData55, IVP_EXTR2NX8(dvecCoeffData3, 24));
+        MORPH_OP_MULA(dacc4, dvecData55, IVP_EXTR2NX8(dvecCoeffData4, 24));
+
+/**************************************** 2nd inCh *********************************************/
+        /* load data from 1st input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData1, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* load data from 2nd input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData2, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* load data from 3rd input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData3, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* load data from 4th input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData4, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* load data from 5th input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData5, vaInData1, pdvecIn1, inDataPitch1 * enable2Row);
+
+        /* load data from 6th input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData6, vaInData1, pdvecIn1, inDataPitch1 * enable2Row);
+
+        /* load data from 7th input row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData7, vaInData1, pdvecIn1, \
+                      inDataPitch2 - (4 + 2 * enable2Row) * inDataPitch1);
+
+        /* load all the 5x5 coefficients for 4 output depths*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2);
+
+        /* Rearrange them so that max no. of quad multipliers can be used */
+        dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+        dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+        dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+        dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx);
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+        dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* dvecData5 is kept separately and is used by quad multiplier finally */
+        dvecData51 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_2);
+
+        /* Values corresponding to first and second row are packed in one register
+            so that same coefficient will get multiplied to them */
+        /* Multiply and accumulate 4 coefficients from 1st set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+        IVP_DSEL2NX8I(dvecData52, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Multiply and accumulate 4 coefficients from 2nd set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+        IVP_DSEL2NX8I(dvecData53, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Multiply and accumulate 4 coefficients from 3rd set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+        IVP_DSEL2NX8I(dvecData54, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Multiply and accumulate 4 coefficients from 4th set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3));
+
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+        IVP_DSEL2NX8I(dvecData55, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Multiply and accumulate 4 coefficients from 5th set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 4));
+
+
+        /* Multiply and acc last coefficient from 1st 4 sets of coeffs for all the outputs*/
+        MORPH_OP_MULQA(dacc1, dvecData54, dvecData53, dvecData52, dvecData51,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+        MORPH_OP_MULQA(dacc2, dvecData54, dvecData53, dvecData52, dvecData51,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+        MORPH_OP_MULQA(dacc3, dvecData54, dvecData53, dvecData52, dvecData51,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5));
+        MORPH_OP_MULQA(dacc4, dvecData54, dvecData53, dvecData52, dvecData51,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 5));
+
+
+        /* Multiply and acc last coefficient(24) with the last row from 2 output channels */
+        MORPH_OP_MULA(dacc1, dvecData55, IVP_EXTR2NX8(dvecCoeffData1, 24));
+        MORPH_OP_MULA(dacc2, dvecData55, IVP_EXTR2NX8(dvecCoeffData2, 24));
+        MORPH_OP_MULA(dacc3, dvecData55, IVP_EXTR2NX8(dvecCoeffData3, 24));
+        MORPH_OP_MULA(dacc4, dvecData55, IVP_EXTR2NX8(dvecCoeffData4, 24));
+
+/**************************************** 3rd inCh *********************************************/
+        /* load data from 1st input row */
+        valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData1, vaInData2, pdvecIn2, inDataPitch1);
+
+        /* load data from 2nd input row */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData2, vaInData2, pdvecIn2, inDataPitch1);
+
+        /* load data from 3rd input row */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData3, vaInData2, pdvecIn2, inDataPitch1);
+
+        /* load data from 4th input row */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData4, vaInData2, pdvecIn2, inDataPitch1);
+
+        /* load data from 5th input row */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData5, vaInData2, pdvecIn2, inDataPitch1 * enable2Row);
+
+        /* load data from 6th input row */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData6, vaInData2, pdvecIn2, inDataPitch1 * enable2Row);
+
+        /* load data from 7th input row */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData7, vaInData2, pdvecIn2, \
+                      inDataPitch2 - (4 + 2 * enable2Row) * inDataPitch1);
+
+        /* load all the 5x5 coefficients for 4 output depths*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2);
+
+        /* Rearrange them so that max no. of quad multipliers can be used */
+        dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+        dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+        dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+        dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx);
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+        dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* dvecData5 is kept separately and is used by quad multiplier finally */
+        dvecData51 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_2);
+
+        /* Values corresponding to first and second row are packed in one register
+            so that same coefficient will get multiplied to them */
+        /* Multiply and accumulate 4 coefficients from 1st set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+        IVP_DSEL2NX8I(dvecData52, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Multiply and accumulate 4 coefficients from 2nd set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+        IVP_DSEL2NX8I(dvecData53, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Multiply and accumulate 4 coefficients from 3rd set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+        IVP_DSEL2NX8I(dvecData54, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Multiply and accumulate 4 coefficients from 4th set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3));
+
+
+        IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+        IVP_DSEL2NX8I(dvecData55, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+        dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+        /* Multiply and accumulate 4 coefficients from 5th set of 5 coeff for all the outputs */
+        MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+        MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+        MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4));
+        MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 4));
+
+
+        /* Multiply and acc last coefficient from 1st 4 sets of coeffs for all the outputs*/
+        MORPH_OP_MULQA(dacc1, dvecData54, dvecData53, dvecData52, dvecData51,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+        MORPH_OP_MULQA(dacc2, dvecData54, dvecData53, dvecData52, dvecData51,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+        MORPH_OP_MULQA(dacc3, dvecData54, dvecData53, dvecData52, dvecData51,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5));
+        MORPH_OP_MULQA(dacc4, dvecData54, dvecData53, dvecData52, dvecData51,
+                       IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                        IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 5));
+
+
+        /* Multiply and acc last coefficient(24) with the last row for all 4 output channels */
+        MORPH_OP_MULA(dacc1, dvecData55, IVP_EXTR2NX8(dvecCoeffData1, 24));
+        MORPH_OP_MULA(dacc2, dvecData55, IVP_EXTR2NX8(dvecCoeffData2, 24));
+        MORPH_OP_MULA(dacc3, dvecData55, IVP_EXTR2NX8(dvecCoeffData3, 24));
+        MORPH_OP_MULA(dacc4, dvecData55, IVP_EXTR2NX8(dvecCoeffData4, 24));
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh ], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh ], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh ], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores */
+        int32_t varLen = XT_MIN(vectorizationWidth, outW - x);
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the third output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the fourth output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable4thCh * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut1L, dvecOut1L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, enable2Row * (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, enable2Row * typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2Row) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut2L, dvecOut2L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, enable2ndCh * enable2Row * (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, enable2ndCh * \
+                       enable2Row * typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the third output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2Row) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut3L, dvecOut3L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, enable3rdCh * enable2Row * (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, enable3rdCh * \
+                       enable2Row * typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the fourth output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2Row) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut4L, dvecOut4L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, enable4thCh * enable2Row * (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, enable4thCh * \
+                       enable2Row * typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4) */
+    }   /* end of for (y = 0; y < outH; y += 2) */
+  }     /* end of for (x = 0; x < outW; x += vectorizationWidth) */
+}
+/******************************************************************************************
+*  xaiConvolved(VQ)3D_S_5x5j2d1_I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 5x5 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 5x5 3D dilated convolution function and 5x5 */
+/*               3D VQ dilated convolution function for U8 bit and S8 bit     */
+/*               input data with input stride equal to 2                      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 5x5xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_5x5j2d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_5x5j2d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_5x5j2d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_5x5j2d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_5x5j2d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 5);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                           \
+                    XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE3D_EDGE(inTile, 2);
+    XAI_CHECK_STRIDE(param, 2);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  if (XAI_TILE3D_GET_DIM3(inTile) == 3)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j2d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8* restrict pdvecIn1;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+  xb_vec2Nx8* restrict pdvecCoeff3;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1;
+
+  /* Since there are 25 coefficients for 1 output channel, we can make use of 6 quad multipliers
+   * for generating 1 output. So we need to re-arrange the 25 coefficients in the pattern shown
+   * Pattern : 0 1 2 3 5 6 7 8 10 11 12 13 15 16 17 18 20 21 22 23 4 9 14 19 24 */
+  xb_vec2Nx8 dvecIdx;
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 10), IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(),
+                                      IVP_SELI_INTERLEAVE_2_LO), IVP_SELI_INTERLEAVE_4_LO);
+
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(
+                                        IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + (19 << 24))),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO),
+                         dvecIdx, IVP_SELI_8B_PACK_16);
+
+  /* loop across output depth is unrolled by 3
+   * , producing lanes from 3 output channels
+   * in one iteration. Since vectorization width
+   * is just half the width of the accumulator,
+   * loop across output height is also unrolled by 2.
+   * Unrolling across output height makes it possible
+   * to utilize all the 64 MACs in the accumulator.
+   *
+   * Data loaded from the 2 input rows is concatenated
+   * in such a manner that lower half of the output
+   * vector gives the first output row and the upper
+   * half of the output vector gives the next output row.
+   */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */
+  {
+    for (y = 0; y < outH; y += 2) /* Loop across output height */
+    {
+      /* In order to handle odd output heights */
+      int32_t enable2Row = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+      /* initialize coeff and Bias data pointer */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 3)  /* Loop across Output depth */
+      {
+        /* In order to handle output depths that are not a multiple of 3 */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+
+        /* Load the bias values corresponding to three output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        /* Coeff for 1st output channel */
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+        /* Coeff for 2nd output channel */
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+        /* Coeff for 3rd output channel */
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+
+        /* Input vector pointer initialization */
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+
+        for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3;
+          xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, \
+                     dvecInData5, dvecInData6, dvecInData7;
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+          MORPH_IDT_2Nx8 dvecData51, dvecData52, dvecData53, dvecData54, dvecData55;
+
+          /* load data from 1st input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData1, vaInData, pdvecIn1, inDataPitch1);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData2, vaInData, pdvecIn1, inDataPitch1);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData3, vaInData, pdvecIn1, inDataPitch1);
+
+          /* load data from 4th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData4, vaInData, pdvecIn1, inDataPitch1);
+
+          /* load data from 5th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData5, vaInData, pdvecIn1, inDataPitch1 * enable2Row);
+
+          /* load data from 6th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData6, vaInData, pdvecIn1, inDataPitch1 * enable2Row);
+
+          /* load data from 7th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          IVP_LA2NX8_XP(dvecInData7, vaInData, pdvecIn1, \
+                        inDataPitch2 - (4 + 2 * enable2Row) * inDataPitch1);
+
+          /* load all the 5x5 coefficients for 3 output depths */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2);
+
+          /* Rearrange them so that max no. of quad multipliers can be used */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+          dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+
+          /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+           * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...63, and the 2nd input row is
+           * 64,65,66,67.........126,127, Data should be arranged  as
+           *
+           * dvecData1 : 0, 2, 4,...58,60,62,64,66,68,...122,124,126
+           * dvecData2 : 1, 3, 5,...59,61,63,65,67,69,...123,125,127
+           * dvecData3 : 2, 4, 6,...60,62,0 ,66,68,70,...124,126,0
+           * dvecData4 : 3, 5, 7,...61,63,0 ,67,69,71,...125,127,0
+           * dvecData5 : 4, 6, 8,...62,0 ,0 ,68,70,72,...126,0  ,0
+           *
+           * Lower half of the vectors contain data from 1st input row and
+           * upper half of the vectors contain data from 2nd input row.
+           *
+           */
+
+          IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+          dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          /* dvecData5 is kept separately and is used by quad multiplier finally */
+          dvecData51 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_2);
+
+          /* Values corresponding to first and second row are packed in one register
+             so that same coefficient will get multiplied to them */
+          /* Multiply and accumulate 4 coefficients from 1st set of 5 coeff for all the outputs */
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+
+
+          IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData52, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+          dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          /* Multiply and accumulate 4 coefficients from 2nd set of 5 coeff for all the outputs */
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+
+
+          IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData53, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+          dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          /* Multiply and accumulate 4 coefficients from 3rd set of 5 coeff for all the outputs */
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+
+
+          IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData54, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+          dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          /* Multiply and accumulate 4 coefficients from 4th set of 5 coeff for all the outputs */
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+
+
+          IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData55, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1);
+          dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          /* Multiply and accumulate 4 coefficients from 5th set of 5 coeff for all the outputs */
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4));
+
+
+          /* Multiply and acc last coefficient from 1st 4 sets of coeffs for all the outputs*/
+          MORPH_OP_MULQA(dacc1, dvecData54, dvecData53, dvecData52, dvecData51,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+          MORPH_OP_MULQA(dacc2, dvecData54, dvecData53, dvecData52, dvecData51,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+          MORPH_OP_MULQA(dacc3, dvecData54, dvecData53, dvecData52, dvecData51,
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \
+                                          IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5));
+
+
+          /* Multiply and acc last coefficient(24) with the last row from 3 output channels */
+          MORPH_OP_MULA(dacc1, dvecData55, IVP_EXTR2NX8(dvecCoeffData1, 24));
+          MORPH_OP_MULA(dacc2, dvecData55, IVP_EXTR2NX8(dvecCoeffData2, 24));
+          MORPH_OP_MULA(dacc3, dvecData55, IVP_EXTR2NX8(dvecCoeffData3, 24));
+        }  /* end of for (inCh = 0; inCh < numInCh; inCh++) */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh ], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores */
+        int32_t varLen = XT_MIN(vectorizationWidth, outW - x);
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the third output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+
+        /* Storing the first output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut1L, dvecOut1L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, enable2Row * (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, enable2Row * typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2Row) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut2L, dvecOut2L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, enable2ndCh * enable2Row * (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, enable2ndCh * \
+                       enable2Row * typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the third output depth, second row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2Row) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut3L, dvecOut3L, IVP_SELI_EXTRACT_HI_HALVES),
+                       vaOutData, pdvecOut, enable3rdCh * enable2Row * (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, enable3rdCh * \
+                       enable2Row * typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 3 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 3 * coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh += 3) */
+    }   /* end of for (y = 0; y < outH; y += 2) */
+  }     /* end of for (x = 0; x < outW; x += vectorizationWidth) */
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_5x5j4d1I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 5x5 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 5x5 3D dilated convolution function and 5x5 */
+/*               3D VQ dilated convolution function for U8 bit and S8 bit     */
+/*               input data with input stride equal to 4                      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XAI Error Code                                               */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 5x5xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_5x5j4d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_5x5j4d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_5x5j4d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_5x5j4d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_5x5j4d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 5);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                           \
+                    XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE3D_EDGE(inTile, 2);
+    XAI_CHECK_STRIDE(param, 4);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2, * restrict pdvecCoeff3;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  /* Number of output elements that can be generated
+   * with 2 input vector loads(64 way).*/
+  const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1;
+
+  /* generates the sequence 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18
+   * , 20,21,22,23 ,4,9,14,19 ,24. To be used to shuffle the coeff data,
+   * So that last coeff from first 4 rows of coeffs can be used as one
+   * 32 byte element and make use of quad multiplier outside the inner-
+   * most loop.
+   * c11, c12, c13, c14, c15
+   * c21, c22, c23, c24, c25
+   * c31, c32, c33, c34, c35
+   * c41, c42, c43, c44, c45
+   * c51, c52, c53, c54, c55
+   *
+   * c15, c25, c35, c45 and c55 are placed in contiguous fashion, so that
+   * c15, c25, c35, and c45 can be used as one 32 byte element
+   */
+  xb_vec2Nx8 dvecIdx;
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 10),
+                                      IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(),
+                                      IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SELI_INTERLEAVE_4_LO);
+
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(
+                                        IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + \
+                                                               (19 << 24))),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO),
+                         dvecIdx, IVP_SELI_8B_PACK_16);
+
+  /* loop across output depth is unrolled by 3
+   * , producing lanes from 3 output channels
+   * in one iteration. Since vectorization width
+   * is just half the width of the accumulator,
+   * loop across output height is also unrolled by 2.
+   * Unrolling across output height makes it possible
+   * to utilize all the 64 MACs in the accumulator.
+   *
+   * Data loaded from the 2 input rows is concatenated
+   * in such a manner that lower half of the output
+   * vector gives the first output row and the upper
+   * half of the output vector gives the next output row.
+   */
+  for (x = 0; x < outW; x += vectorizationWidth)      /* Loop across Output width */
+  {
+    /* out of bound flag */
+    int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+    for (y = 0; y < outH - 1; y += 2)    /* Loop across Output height */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x];
+
+      /* initialize coeff & bias data pointer to outCh kernel */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 3) /* Loop across Output depth */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+
+        /* loads and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+
+        for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+        {
+          /* variable declarations for input and coeff vectors */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3;
+
+          /* load coeff for all the 3 output channels */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2);
+
+          /* shuffles the loaded coeff put them in proper order */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+          dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+
+          /* loads 1st input row */
+          MORPH_IDT_2Nx8 dvecInData11, dvecInData12;
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \
+                             inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 2nd input row */
+          MORPH_IDT_2Nx8 dvecInData21, dvecInData22;
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \
+                             inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 3rd input row */
+          MORPH_IDT_2Nx8 dvecInData31, dvecInData32;
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \
+                             inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 4th input row */
+          MORPH_IDT_2Nx8 dvecInData41, dvecInData42;
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \
+                             inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 5th input row */
+          MORPH_IDT_2Nx8 dvecInData51, dvecInData52;
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \
+                             inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 6th input row */
+          MORPH_IDT_2Nx8 dvecInData61, dvecInData62;
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \
+                             inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 7th input row */
+          MORPH_IDT_2Nx8 dvecInData71, dvecInData72;
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, \
+                             inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 8th input row */
+          MORPH_IDT_2Nx8 dvecInData81, dvecInData82;
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData81, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData82, vaInData, pdvecIn, \
+                             inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 9th input row */
+          MORPH_IDT_2Nx8 dvecInData91, dvecInData92;
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData91, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData92, vaInData, pdvecIn, \
+                             inDataPitch2 - 8 * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+           * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is
+           * 128,129,130,131.........252,253,254,255, Data should be arranged  as
+           *
+           * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252
+           * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253
+           * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254
+           * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255
+           * dvecData5 : 4, 8,12,...124,0  ,132,136,140,...252,0
+           *
+           * Lower half of the vectors contain data from 1st input row and
+           * upper half of the vectors contain data from 2nd output row.
+           *
+           */
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+          MORPH_IDT_2Nx8 dvecData51, dvecData52, dvecData53, dvecData54, dvecData55;
+
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* dvecData5 is kept separately and is used by quad multiplier finally */
+          dvecData51 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from all three output channels and accumulate. Lower
+           * half of the accumulators contain data corresponding to the first
+           * output row and upper half contains next output row */
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+
+          /* Calculations for second row */
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          dvecData52 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+
+          /* Calculations for third row */
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          dvecData53 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+
+          /* Calculations for fourth row */
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        IVP_SEL2NX8I(dvecInData82, dvecInData81, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        IVP_SEL2NX8I(dvecInData82, dvecInData81, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          dvecData54 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+
+          /* Calculations for fifth row */
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        IVP_SEL2NX8I(dvecInData92, dvecInData91, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        IVP_SEL2NX8I(dvecInData92, dvecInData91, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          dvecData55 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4));
+
+          /* multiplies last coeffs of 1st four rows with the input data */
+          MORPH_OP_MULQA(dacc1, dvecData54, dvecData53, dvecData52, dvecData51, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+          MORPH_OP_MULQA(dacc2, dvecData54, dvecData53, dvecData52, dvecData51, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+          MORPH_OP_MULQA(dacc3, dvecData54, dvecData53, dvecData52, dvecData51, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5));
+
+          /* multiplies last coeff(24th) with the input data */
+          MORPH_OP_MULA(dacc1, dvecData55, IVP_EXTR2NX8(dvecCoeffData1, 24));
+          MORPH_OP_MULA(dacc2, dvecData55, IVP_EXTR2NX8(dvecCoeffData2, 24));
+          MORPH_OP_MULA(dacc3, dvecData55, IVP_EXTR2NX8(dvecCoeffData3, 24));
+        }     /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable store count */
+        int32_t varLen = XT_MIN(outW - x, vectorizationWidth);
+
+        /* Storing the first row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* extract the upper half of the output vectors
+         * and store in the next row
+         */
+
+        /* Storing the 2nd row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable3rdCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 3 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 3 * coeffPitch3;
+      }  /* end of (outCh = 0; outCh < numOutCh; outCh += 3)*/
+    }    /* end of for (y = 0; y < outH - 1; y += 2)*/
+    if (y < outH)
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x];
+
+      /* initialize coeff & bias data pointer to outCh kernel */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 3) /* Loop across Output depth */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+
+        /* loads and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+
+        for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+        {
+          /* variable declarations for input and coeff vectors */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3;
+
+          /* load coeff for all 3 output channels */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2);
+
+          /* shuffles the loaded coeff put them in proper order */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+          dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+
+          /* loads 1st input row */
+          MORPH_IDT_2Nx8 dvecInData11, dvecInData12;
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \
+                             inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 2nd input row */
+          MORPH_IDT_2Nx8 dvecInData21, dvecInData22;
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \
+                             inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 3rd input row */
+          MORPH_IDT_2Nx8 dvecInData31, dvecInData32;
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \
+                             inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 4th input row */
+          MORPH_IDT_2Nx8 dvecInData41, dvecInData42;
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \
+                             inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 5th input row */
+          MORPH_IDT_2Nx8 dvecInData51, dvecInData52;
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, inDataPitch2 - 4 * inDataPitch1 - \
+                             2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /*
+           * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127,
+           *
+           * dvecData1 : 0, 4, 8,...120,124
+           * dvecData2 : 1, 5, 9,...121,125
+           * dvecData3 : 2, 6,10,...122,126
+           * dvecData4 : 3, 7,11,...123,127
+           * dvecData5 : 4, 8,12,...124,0
+           *
+           */
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+          MORPH_IDT_2Nx8 dvecData51, dvecData52, dvecData53, dvecData54, dvecData55;
+
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        0,
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        0,
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* dvecData5 is kept separately and is used by quad multiplier finally */
+          dvecData51 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          /* multiplies data from input row with coeff from
+           * all three output channels and accumulate. */
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+
+          /* Calculations for second row */
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        0,
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        0,
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          dvecData52 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+
+          /* Calculations for third row */
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        0,
+                        IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        0,
+                        IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          dvecData53 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+
+          /* Calculations for fourth row */
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        0,
+                        IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        0,
+                        IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          dvecData54 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+
+          /* Calculations for fifth row */
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        0,
+                        IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        0,
+                        IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          dvecData55 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4));
+
+          /* multiplies last coeffs of 1st four rows with the input data */
+          MORPH_OP_MULQA(dacc1, dvecData54, dvecData53, dvecData52, dvecData51, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+          MORPH_OP_MULQA(dacc2, dvecData54, dvecData53, dvecData52, dvecData51, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+          MORPH_OP_MULQA(dacc3, dvecData54, dvecData53, dvecData52, dvecData51, IVP_EXTRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5));
+
+          /* multiplies last coeff(24th) with the input data */
+          MORPH_OP_MULA(dacc1, dvecData55, IVP_EXTR2NX8(dvecCoeffData1, 24));
+          MORPH_OP_MULA(dacc2, dvecData55, IVP_EXTR2NX8(dvecCoeffData2, 24));
+          MORPH_OP_MULA(dacc3, dvecData55, IVP_EXTR2NX8(dvecCoeffData3, 24));
+        }     /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable store count */
+        int32_t varLen = XT_MIN(outW - x, vectorizationWidth);
+
+        /* Storing the first row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 3 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 3 * coeffPitch3;
+      }  /* end of (outCh = 0; outCh < numOutCh; outCh += 3)*/
+    }    /* end of if(y < outH)*/
+  }      /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  return(XAI_ERROR_STATUS());
+}
+
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_5x5j1d2I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 5x5 3D convolution   */
+/*               with dilation = 2                                            */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 5x5 3D dilated convolution function and 5x5 */
+/*               3D VQ dilated convolution function for U8 and S8             */
+/*               input data with input stride equal to 1                      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XAI Error Code                                               */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 5x5xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_5x5j1d2_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_5x5j1d2_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_5x5j1d2_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_5x5j1d2_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_5x5j1d2), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 5);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_TILE3D_EDGE(inTile, 4);
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_DILATION(param, 2);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN) */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t dilationU     = XAI_CNN_CONV_GET_DILATION(param);
+
+  int32_t dilatedkSizeU = dilationU * (kSizeU - 1) + 1;
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((dilatedkSizeU / 2) * inDataPitch1 + (dilatedkSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth = ((4 * XCHAL_IVPN_SIMD_WIDTH) - dilatedkSizeU) + 1;
+
+  /* generates the sequence 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18
+   * , 20,21,22,23 ,4,9,14,19 ,24. To be used to shuffle the coeff data,
+   * So that last coeff from first 4 rows of coeffs can be used as one
+   * 32 bit element and make use of quad multiplier outside the inner-
+   * most loop.
+   * c11, c12, c13, c14, c15
+   * c21, c22, c23, c24, c25
+   * c31, c32, c33, c34, c35
+   * c41, c42, c43, c44, c45
+   * c51, c52, c53, c54, c55
+   *
+   * c15, c25, c35, c45 and c55 are placed in contiguous fashion, so that
+   * c15, c25, c35, and c45 can be used as one 32 bit element
+   */
+  xb_vec2Nx8 dvecIdx;
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 10),
+                                      IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(),
+                                      IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SELI_INTERLEAVE_4_LO);
+
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(
+                                        IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + \
+                                                               (19 << 24))),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO),
+                         dvecIdx, IVP_SELI_8B_PACK_16);
+
+  /* loop across output height is unrolled twice and loop across kernel width and height is
+     completely unrolled*/
+
+  /* 0 1 2 3 .. 62 63*/
+  xb_vec2Nx8 dvecPattern1 = IVP_SEQ2NX8();
+  /* 64 65 66 ...126 127*/
+  xb_vec2Nx8 dvecPattern2 = IVP_ADD2NX8(dvecPattern1, 64);
+
+  if (!typeFlag)
+  {
+    MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \
+                   dvecPattern2, dvecPattern1, \
+                   IVP_DSELI_8B_INTERLEAVE_1);
+  }
+  else
+  {
+    MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \
+                   dvecPattern2, dvecPattern1, \
+                   IVP_DSELI_INTERLEAVE_1);
+  }
+  for (x = 0; x < outW; x += vectorizationWidth)  /* Loop across output width */
+  {
+    int32_t remX = XT_MIN(vectorizationWidth, outW - x);
+
+    /* If (remX + dilatedkSizeU - 1) <= 2 * XCHAL_IVPN_SIMD_WIDTH,
+     * i.e. if the number of input data bytes corresponding to remX number of outputs
+     * is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, there is no need to load
+     * the next 64 input bytes*/
+    int32_t remXLoad = ((remX + dilatedkSizeU - 1) > 2 * XCHAL_IVPN_SIMD_WIDTH) ? 1 : 0;
+
+    for (y = 0; y < outH; y++) /* Loop across output height */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer*/
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2)  /* Loop across Output depth */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        /* Load the bias values corresponding to two output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+        dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+        dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) pInput;
+
+        for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+          xb_vec2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, dvecInData51;
+          xb_vec2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, dvecInData52;
+
+          /* load data from first input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 4th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 5th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, inDataPitch2 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH - dilationU * 4 * inDataPitch1);
+
+          /* load all the 5x5 coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+
+          /* Rearrange them so that 5 MUL4T,1 MULQ,1 MUL can be used to perform entire operation */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+
+          /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, 0, IVP_SEL2NX8I(dvecInData12, dvecInData11,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(dvecInData12, dvecInData11,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          MORPH_OP_MUL4TA(dacc21, 0, IVP_SEL2NX8I(dvecInData12, dvecInData11,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(dvecInData12, dvecInData11,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, 0, dvecInData21, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc21, 0, dvecInData21, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+          /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, 0, IVP_SEL2NX8I(dvecInData32, dvecInData31,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(dvecInData32, dvecInData31,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+          MORPH_OP_MUL4TA(dacc21, 0, IVP_SEL2NX8I(dvecInData32, dvecInData31,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(dvecInData32, dvecInData31,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+          /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, 0, IVP_SEL2NX8I(dvecInData42, dvecInData41,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(dvecInData42, dvecInData41,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+          MORPH_OP_MUL4TA(dacc21, 0, IVP_SEL2NX8I(dvecInData42, dvecInData41,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(dvecInData42, dvecInData41,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+
+          /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, 0, IVP_SEL2NX8I(dvecInData52, dvecInData51,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(dvecInData52, dvecInData51,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+
+          MORPH_OP_MUL4TA(dacc21, 0, IVP_SEL2NX8I(dvecInData52, dvecInData51,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+          MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(dvecInData52, dvecInData51,                        \
+                                                  IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+
+          /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */
+          MORPH_OP_MULQA(dacc11,                                                             \
+                         IVP_SEL2NX8I(dvecInData41, IVP_SEL2NX8I(dvecInData42, dvecInData41, \
+                                                                 IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData31, IVP_SEL2NX8I(dvecInData32, dvecInData31, \
+                                                                 IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData11, IVP_SEL2NX8I(dvecInData12, dvecInData11, \
+                                                                 IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          MORPH_OP_MULQA(dacc12,                                                             \
+                         IVP_SEL2NX8I(dvecInData42, IVP_SEL2NX8I(dvecInData42, dvecInData41, \
+                                                                 IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData32, IVP_SEL2NX8I(dvecInData32, dvecInData31, \
+                                                                 IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData12, IVP_SEL2NX8I(dvecInData12, dvecInData11, \
+                                                                 IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          MORPH_OP_MULQA(dacc21,                                                             \
+                         IVP_SEL2NX8I(dvecInData41, IVP_SEL2NX8I(dvecInData42, dvecInData41, \
+                                                                 IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData31, IVP_SEL2NX8I(dvecInData32, dvecInData31, \
+                                                                 IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData11, IVP_SEL2NX8I(dvecInData12, dvecInData11, \
+                                                                 IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+          MORPH_OP_MULQA(dacc22,                                                             \
+                         IVP_SEL2NX8I(dvecInData42, IVP_SEL2NX8I(dvecInData42, dvecInData41, \
+                                                                 IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData32, IVP_SEL2NX8I(dvecInData32, dvecInData31, \
+                                                                 IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData12, IVP_SEL2NX8I(dvecInData12, dvecInData11, \
+                                                                 IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+          /* Multiply and accumulate the final coefficient for all the outputs */
+          MORPH_OP_MULA(dacc11, IVP_SEL2NX8I(dvecInData51,                                                                \
+                                             IVP_SEL2NX8I(dvecInData52, dvecInData51,                                     \
+                                                          IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTR2NX8(dvecCoeffData1, 24));
+          MORPH_OP_MULA(dacc12, IVP_SEL2NX8I(dvecInData52,                                                                \
+                                             IVP_SEL2NX8I(dvecInData52, dvecInData51,                                     \
+                                                          IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTR2NX8(dvecCoeffData1, 24));
+
+          MORPH_OP_MULA(dacc21, IVP_SEL2NX8I(dvecInData51,                                                                \
+                                             IVP_SEL2NX8I(dvecInData52, dvecInData51,                                     \
+                                                          IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTR2NX8(dvecCoeffData2, 24));
+          MORPH_OP_MULA(dacc22, IVP_SEL2NX8I(dvecInData52,                                                                \
+                                             IVP_SEL2NX8I(dvecInData52, dvecInData51,                                     \
+                                                          IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTR2NX8(dvecCoeffData2, 24));
+        }  /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvec1L, dvec2L, dvec3L, dvec4L;
+        xb_vec2Nx8 dvec1H, dvec2H, dvec3H, dvec4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Interleave odd and even indices */
+        xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1);
+        xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2);
+        xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1);
+        xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2);
+        xb_vec2Nx8 dvecOut3L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern1);
+        xb_vec2Nx8 dvecOut4L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern2);
+        xb_vec2Nx8 dvecOut3H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern1);
+        xb_vec2Nx8 dvecOut4H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern2);
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remX - \
+                       2 * XCHAL_IVPN_SIMD_WIDTH);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remX * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * remX) - \
+                                                        2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 2 * coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+    }   /* end of for (y = 0; y < outH; y++)*/
+  }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_5x5j1d4I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 5x5 3D convolution   */
+/*               with dilation = 4                                            */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 5x5 3D dilated convolution function and 5x5 */
+/*               3D VQ dilated convolution function for 8-bit (U8 / S8) input */
+/*               data with convolution stride equal to 1                      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 5x5xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_5x5j1d4_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_5x5j1d4_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_5x5j1d4_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_5x5j1d4_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_5x5j1d4), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 5);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_TILE3D_EDGE(inTile, 8);
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_DILATION(param, 4);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN) */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t dilationU     = XAI_CNN_CONV_GET_DILATION(param);
+
+  int32_t dilatedkSizeU = dilationU * (kSizeU - 1) + 1;
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((dilatedkSizeU / 2) * inDataPitch1 + (dilatedkSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+
+
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth = ((4 * XCHAL_IVPN_SIMD_WIDTH) - dilatedkSizeU) + 1;
+
+  /* generates the sequence 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18
+   * , 20,21,22,23 ,4,9,14,19 ,24. To be used to shuffle the coeff data,
+   * So that last coeff from first 4 rows of coeffs can be used as one
+   * 32 bit element and make use of quad multiplier outside the inner-
+   * most loop.
+   * c11, c12, c13, c14, c15
+   * c21, c22, c23, c24, c25
+   * c31, c32, c33, c34, c35
+   * c41, c42, c43, c44, c45
+   * c51, c52, c53, c54, c55
+   *
+   * c15, c25, c35, c45 and c55 are placed in contiguous fashion, so that
+   * c15, c25, c35, and c45 can be used as one 32 bit element
+   */
+  xb_vec2Nx8 dvecIdx;
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 10),
+                                      IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(),
+                                      IVP_SELI_INTERLEAVE_2_LO),
+                         IVP_SELI_INTERLEAVE_4_LO);
+
+  dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(
+                                        IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + \
+                                                               (19 << 24))),
+                                      IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO),
+                         dvecIdx, IVP_SELI_8B_PACK_16);
+
+  /* loop across output depth is unrolled twice (two output channels per iteration) and the
+     loops across kernel width and height are completely unrolled */
+
+  for (x = 0; x < outW; x += vectorizationWidth)  /* Loop across output width */
+  {
+    int32_t remX = XT_MIN(vectorizationWidth, outW - x);
+
+    /* If (remX + dilatedkSizeU - 1) <= 2 * XCHAL_IVPN_SIMD_WIDTH,
+     * i.e. if the number of input data bytes needed to produce remX outputs
+     * is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, there is no need to load
+     * the next 2 * XCHAL_IVPN_SIMD_WIDTH input bytes (remXLoad = 0) */
+    int32_t remXLoad = ((remX + dilatedkSizeU - 1) > 2 * XCHAL_IVPN_SIMD_WIDTH) ? 1 : 0;
+
+    for (y = 0; y < outH; y++) /* Loop across output height */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer*/
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2)  /* Loop across Output depth */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        /* Load the bias values corresponding to two output channels */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+        dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+        dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) pInput;
+
+        for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+          xb_vec2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, dvecInData51;
+          xb_vec2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, dvecInData52;
+
+          /* load data from first input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 4th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData42, dvecInData41, dvecInData42, dvecInData41, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData42, dvecInData41, dvecInData42, dvecInData41, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 5th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, inDataPitch2 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH - dilationU * 4 * inDataPitch1);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData52, dvecInData51, dvecInData52, dvecInData51, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData52, dvecInData51, dvecInData52, dvecInData51, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load all the 5x5 coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+
+          /* Rearrange them so that 5 MUL4T,1 MULQ,1 MUL can be used to perform entire operation */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+
+          /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData12, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData12, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, 0, dvecInData21, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc21, 0, dvecInData21, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+          /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, 0, dvecInData31, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData32, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+          MORPH_OP_MUL4TA(dacc21, 0, dvecInData31, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData32, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+          /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, 0, dvecInData41, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData42, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+          MORPH_OP_MUL4TA(dacc21, 0, dvecInData41, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData42, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+
+          /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */
+          MORPH_OP_MUL4TA(dacc11, 0, dvecInData51, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData52, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+
+          MORPH_OP_MUL4TA(dacc21, 0, dvecInData51, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData52, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+
+          /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */
+          MORPH_OP_MULQA(dacc11, \
+                         IVP_SEL2NX8I(dvecInData41, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData31, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          MORPH_OP_MULQA(dacc12, \
+                         IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData12, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          MORPH_OP_MULQA(dacc21, \
+                         IVP_SEL2NX8I(dvecInData41, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData31, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+          MORPH_OP_MULQA(dacc22, \
+                         IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_SEL2NX8I(dvecInData12, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4),
+                         IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+          /* Multiply and accumulate the final coefficient for all the outputs */
+          MORPH_OP_MULA(dacc11, IVP_SEL2NX8I(dvecInData51, dvecInData51, \
+                                             IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTR2NX8(dvecCoeffData1, 24));
+          MORPH_OP_MULA(dacc12, IVP_SEL2NX8I(dvecInData52, dvecInData52, \
+                                             IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTR2NX8(dvecCoeffData1, 24));
+
+          MORPH_OP_MULA(dacc21, IVP_SEL2NX8I(dvecInData51, dvecInData51, \
+                                             IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTR2NX8(dvecCoeffData2, 24));
+          MORPH_OP_MULA(dacc22, IVP_SEL2NX8I(dvecInData52, dvecInData52, \
+                                             IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTR2NX8(dvecCoeffData2, 24));
+        }  /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        if (!typeFlag)
+        {
+          MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                         IVP_DSELI_8B_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                         IVP_DSELI_8B_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                         IVP_DSELI_8B_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                         IVP_DSELI_8B_INTERLEAVE_1);
+        }
+        else
+        {
+          MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \
+                         IVP_DSELI_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \
+                         IVP_DSELI_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                         IVP_DSELI_INTERLEAVE_2);
+          MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \
+                         IVP_DSELI_INTERLEAVE_2);
+          MORPH_OP_DSELI(dvecOut3H, dvecOut3L, dvecOut3H, dvecOut3L, \
+                         IVP_DSELI_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut4H, dvecOut4L, dvecOut4H, dvecOut4L, \
+                         IVP_DSELI_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                         IVP_DSELI_INTERLEAVE_2);
+          MORPH_OP_DSELI(dvecOut4H, dvecOut3H, dvecOut4H, dvecOut3H, \
+                         IVP_DSELI_INTERLEAVE_2);
+        }
+
+        /* Storing the first output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remX - \
+                       2 * XCHAL_IVPN_SIMD_WIDTH);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the second output depth, first row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remX * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * remX) - \
+                                                        2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 2 * coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/
+    }   /* end of for (y = 0; y < outH; y++)*/
+  }     /* end of for (x = 0; x < outW; x += 2 * vectorizationWidth)*/
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_7x7j1d1I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 7x7 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate the 7x7 3D dilated convolution function and */
+/*               the 7x7 3D VQ dilated convolution function for U8 and S8     */
+/*               input data with input stride equal to 1.                     */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 7x7xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/******************************************************************************************
+* 7x7 MOW WHD Stride 1 - DEPTH 3                                                          *
+* This function is called when the number of input channels                               *
+* is equal to 3.                                                                          *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j1d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+
+  /* Variable Declarations */
+  int32_t outCh, x, y;
+  int32_t varLen;
+
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth = ((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1;
+
+  /* Vectorization width taken is 122 (4 * XCHAL_IVPN_SIMD_WIDTH - kSizeU + 1 with a SIMD width of 32 and kSizeU = 7).
+   * Two loads are issued from the same row, and the loop across output height is unrolled twice.
+   * Thus a single iteration produces 4 output vectors.
+   * Input channels, kernel width and kernel height
+   * are completely unrolled.
+   */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */
+  {
+    /* In order to handle cases where input width <= 64, where
+     * the 2nd load from the same row needs to be avoided.  */
+    varLen = XT_MIN(vectorizationWidth, outW - x);
+    int32_t enable2ndCol = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, varLen + kSizeU - 1);
+
+    for (y = 0; y < outH; y += 2)  /* Loop across Output height */
+    {
+      /* In order to handle odd output height */
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer*/
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh++)  /* Loop across Output depth */
+      {
+        /* load and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+        dacc11 = dacc12 = dacc21 = dacc22 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc21, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc22, hvecBias1, hvecBias1);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) pInput;
+
+        /* Two 4-tap multipliers are used to accumulate one wide vector.
+         * The first 4-tap multiplier makes use of the first 4 coeff across
+         * the kernel width. The next 4-tap multiplier makes use of the last 3
+         * coeff across the kernel width, and the 4th byte is zero.
+         */
+
+        MORPH_IDT_2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, \
+                       dvecInData51, dvecInData61, dvecInData71, dvecInData81;
+
+        MORPH_IDT_2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, \
+                       dvecInData52, dvecInData62, dvecInData72, dvecInData82;
+
+        /** Input Channel 1 **/
+        /* load data from first input row */
+        valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 2nd input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 3rd input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 4th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 5th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 6th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 7th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, \
+                           inDataPitch1 * enable2ndRow - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 8th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData81, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData82, vaInData, pdvecIn,                 \
+                           inDataPitch2 - (6 + enable2ndRow) * inDataPitch1 \
+                           - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load first row of the coeff */
+        xb_vec2Nx8 dvecCoeffData1;
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with first coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData12, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 2nd row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 2nd coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData32, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 3rd row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 3rd coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData32, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData42, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 4th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 4th coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData42, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData52, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 5th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 5th coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData52, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData62, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 6th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 6th coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData62, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData72, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 7th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+        /* Multiply input vectors with 7th coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData72, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData82, dvecInData81, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData82, dvecInData81, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData82, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /** Input Channel 2 **/
+        /* load data from 1st input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 2nd input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 3rd input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 4th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 5th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+
+        /* load data from 6th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 7th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, \
+                           inDataPitch1 * enable2ndRow - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 8th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData81, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData82, vaInData, pdvecIn,                 \
+                           inDataPitch2 - (6 + enable2ndRow) * inDataPitch1 \
+                           - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load first row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with first coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData12, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 2nd row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 2nd coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData32, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 3rd row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 3rd coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData32, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData42, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 4th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 4th coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData42, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData52, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 5th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 5th coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData52, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData62, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 6th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 6th coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData62, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData72, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 7th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+        /* Multiply input vectors with 7th coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData72, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData82, dvecInData81, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData82, dvecInData81, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData82, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /** Input Channel 3 **/
+        /* load data from 1st input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 2nd input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 3rd input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 4th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 5th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+
+        /* load data from 6th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \
+                           inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 7th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, \
+                           inDataPitch1 * enable2ndRow - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load data from 8th input row */
+        vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData81, vaInData, pdvecIn, \
+                           enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+        MORPH_OP_LOAD_2Nx8(dvecInData82, vaInData, pdvecIn,                 \
+                           inDataPitch2 - (6 + enable2ndRow) * inDataPitch1 \
+                           - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+        /* load first row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with first coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData12, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 2nd row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 2nd coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData32, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 3rd row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 3rd coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData32, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData42, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 4th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 4th coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData42, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData52, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 5th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 5th coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData52, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData62, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 6th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 6th coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData62, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData72, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 7th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+        /* Multiply input vectors with 7th coeff row */
+        MORPH_OP_MUL4TA(dacc11, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc12, 0, dvecInData72, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc21, dvecInData82, dvecInData81, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4),
+                        IVP_SEL2NX8I(dvecInData82, dvecInData81, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        MORPH_OP_MUL4TA(dacc22, 0, dvecInData82, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* 1st row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* 2nd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * \
+                       (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \
+                       2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += outDataPitch2 * bytesPerPixel;
+        pCoeff  += coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/
+    }   /* end of for (y = 0; y < outH; y += 2)*/
+  }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+}
+
+/******************************************************************************************
+* 7x7 MOW FOLD32 variant, stride 1, dilation 1 (WHD data order).                          *
+* Used when inDataPitch1 <= 32: each 2N-wide load then holds two input rows, so a single  *
+* accumulator produces two output rows per iteration.                                     *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j1d1), S8IX_MOW_WHD_FOLD32) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Output width/height and input/output channel counts, read from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel size: dim1 of the WHDN coefficient tile; used below to compute the edge offset */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters: accumulator shift, output shift and ReLU enable flag */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Pitches of Coefficient Data (WHDN) in dim1 (per kernel row) and dim3 (per output channel) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 (per row) and dim2 (per channel) */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitches of Output Data (WHD) in dim1 (per row) and dim2 (per channel) */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data pointers of input, output, bias and coefficient data (plus output scale, see below) */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Step back kSizeU / 2 rows and kSizeU / 2 columns so pInData points at the top-left edge pixel */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Clamp limits: the ReLU min/max when ReLU is enabled, otherwise the full range of the output tile type */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+
+
+  /* Loop counters: input channel, output channel and output row */
+  int32_t inCh, outCh, y;
+
+  /* Select pattern for dvecTemp1..3 below: lanes < inDataPitch1 get seq + inDataPitch1, the other lanes get seq - inDataPitch1 + 64 (predicated adds; NOTE(review): confirm lane semantics against the ISA reference) */
+  xb_vec2Nx8 dvecSeq = 0;
+  IVP_ADD2NX8T(dvecSeq, IVP_SEQ2NX8(), inDataPitch1, IVP_LT2NX8(IVP_SEQ2NX8(), inDataPitch1));
+  IVP_ADD2NX8T(dvecSeq, IVP_SUB2NX8(IVP_SEQ2NX8(), inDataPitch1), 64, \
+               IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), inDataPitch1)));
+
+  /* Each iteration handles two output rows (y and y + 1).
+   * Both rows are accumulated in the same wide vector (dacc1),
+   * because every input vector holds two consecutive input rows. */
+  for (y = 0; y < outH; y += 2)  /* Loop across Output height */
+  {
+    /* enable2ndRow gates the load and store of row y + 1 below, so that an odd output height is handled */
+    int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+    /* initialize output data pointer to the start of output row y */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize coeff and Bias data pointers to the first output channel */
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh++)  /* Loop across Output depth */
+    {
+      /* load and replicate the bias data of this output channel */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias. NOTE(review): dacc2 is initialized here but never used afterwards in this function */
+      xb_vec2Nx24 dacc1, dacc2;
+      dacc1 = dacc2 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1);
+
+      /* priming of the coeff load is done outside the innermost (input channel) loop */
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      /* initialize input data pointer to the first input row needed for output row y */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input depth */
+      {
+        /* Two 4-tap multiplies are used per kernel row to accumulate
+         * one wide vector: the first 4-tap multiply uses the first 4
+         * coeffs across the kernel width; the second 4-tap multiply
+         * uses the last 3 coeffs, and its 4th coeff byte is zero.
+         */
+
+        xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4;
+
+        /* load data from first 2 input rows */
+        pdvecIn = (MORPH_IDT_2Nx8 *) pInput;
+        valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, 2 * inDataPitch1);
+
+        /* load data from next 2 input rows */
+        MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, 2 * inDataPitch1);
+
+        /* load data from next 2 input rows */
+        MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, 2 * inDataPitch1);
+
+        /* load the last 1 or 2 input rows: 2 rows only when the second output row is enabled */
+        MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData4, vaInData, pdvecIn, (1 + enable2ndRow) * inDataPitch1);
+        pInput += inDataPitch2;
+
+        /* load first row of the coeff; each coeff load advances the pointer by coeffPitch1 */
+        xb_vec2Nx8 dvecCoeffData1;
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        xb_vec2Nx8 dvecTemp1, dvecTemp2, dvecTemp3;
+        dvecTemp1 = IVP_SEL2NX8(dvecInData2, dvecInData1, dvecSeq);
+        dvecTemp2 = IVP_SEL2NX8(dvecInData3, dvecInData2, dvecSeq);
+        dvecTemp3 = IVP_SEL2NX8(dvecInData4, dvecInData3, dvecSeq);
+
+        /* Multiply input vectors with first coeff row: coeff word 0 on the data as loaded, coeff word 1 on the data rotated by 4 bytes */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecInData1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+
+        /* load 2nd row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 2nd coeff row (dvecTemp1: presumably the input shifted by one row via dvecSeq) */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecTemp1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecTemp1, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+
+        /* load 3rd row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 3rd coeff row */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecInData2, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+
+        /* load 4th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 4th coeff row (row-shifted vector dvecTemp2) */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecTemp2, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecTemp2, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* load 5th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 5th coeff row */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecInData3, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+
+        /* load 6th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 6th coeff row (row-shifted vector dvecTemp3) */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecTemp3, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecTemp3, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+
+
+        /* load 7th row of the coeff */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+        /* Multiply input vectors with 7th coeff row */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecInData4, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                        IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+      }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+      /* Pack, Output Scale, Output Shift and clamping (per-channel scale in the VQ build, single scale otherwise) */
+      xb_vec2Nx8 dvecOut1L;
+      xb_vec2Nx8 dvecOut1H;
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+      /* Store output row y of this output channel: bytesPerPixel * outW bytes taken from dvecOut1L */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Store output row y + 1: data selected from dvecOut1H:dvecOut1L starting at offset inDataPitch1 * bytesPerPixel; the byte count is 0 when enable2ndRow is 0 */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      pOutput += outDataPitch2 * bytesPerPixel;
+      pCoeff  += coeffPitch3;
+    } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/
+  }   /* end of for (y = 0; y < outH; y += 2)*/
+}
+
+/****************** xaiConvolvedVQ3D_S_7x7j1d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_7x7j1d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_7x7j1d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_7x7j1d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_7x7j1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 7);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                           \
+                    XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE3D_EDGE(inTile, 3);
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Call the DEPTH3 variant if the input depth is 3 */
+  if (XAI_TILE3D_GET_DIM3(inTile) == 3)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j1d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+  /* check inDataPitch1, if it is less than or equal to 32,
+   * call FOLD32 variant
+   */
+  if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 32)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j1d1), S8IX_MOW_WHD_FOLD32) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  int32_t varLen;
+
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+      by extra values required for the kernel */
+  const int32_t vectorizationWidth58  = ((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1;
+  const int32_t vectorizationWidth122 = ((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1;
+
+  /* loop across output height is unrolled twice.
+   * Loop across kernel width and height is
+   * completely unrolled.
+   * 128 bytes of input are loaded.
+   */
+  for (x = 0; x < outW - vectorizationWidth58; x += vectorizationWidth122)  /* Loop across Output width */
+  {
+    varLen = XT_MIN(vectorizationWidth122, outW - x);
+    int32_t enable2ndCol = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, varLen + kSizeU - 1);
+    for (y = 0; y < outH; y += 2)   /* Loop across Output height */
+    {
+      /* In order to handle odd output height */
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer*/
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh++)   /* Loop across Output depth */
+      {
+        /* load and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+        dacc11 = dacc12 = dacc21 = dacc22 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc21, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc22, hvecBias1, hvecBias1);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) pInput;
+
+        for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input depth */
+        {
+          /* 2 4-tap multipliers are used to accumulate 1 wide vector.
+           * The first 4-tap multiplier makes use of the first 4 coeffs
+           * across the kernel width. The next 4-tap multiplier makes use of
+           * the last 3 coeffs across the kernel width; its 4th byte is zero.
+           */
+
+          MORPH_IDT_2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, dvecInData51, \
+                         dvecInData61, dvecInData71, dvecInData81;
+
+          MORPH_IDT_2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, dvecInData52, \
+                         dvecInData62, dvecInData72, dvecInData82;
+
+          /* load data from first input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \
+                             inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \
+                             inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \
+                             inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 4th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \
+                             inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 5th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \
+                             inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 6th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \
+                             inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 7th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, \
+                             inDataPitch1 * enable2ndRow      \
+                             - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load data from 8th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData81, vaInData, pdvecIn, \
+                             enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData82, vaInData, pdvecIn,                 \
+                             inDataPitch2 - (6 + enable2ndRow) * inDataPitch1 \
+                             - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* load first row of the coeff */
+          xb_vec2Nx8 dvecCoeffData1;
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with first coeff row */
+          MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData12, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc21, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 2nd row of the coeff */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with 2nd coeff row */
+          MORPH_OP_MUL4TA(dacc11, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc21, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData32, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 3rd row of the coeff */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with 3rd coeff row */
+          MORPH_OP_MUL4TA(dacc11, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData32, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc21, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData42, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 4th row of the coeff */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with 4th coeff row */
+          MORPH_OP_MUL4TA(dacc11, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData42, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc21, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData52, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 5th row of the coeff */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with 5th coeff row */
+          MORPH_OP_MUL4TA(dacc11, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData52, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+
+          MORPH_OP_MUL4TA(dacc21, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData62, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 6th row of the coeff */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with 6th coeff row */
+          MORPH_OP_MUL4TA(dacc11, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData62, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc21, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData72, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 7th row of the coeff */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+          /* Multiply input vectors with 7th coeff row */
+          MORPH_OP_MUL4TA(dacc11, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc12, 0, dvecInData72, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc21, dvecInData82, dvecInData81, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_SEL2NX8I(dvecInData82, dvecInData81, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc22, 0, dvecInData82, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* 1st row  */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* 2nd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * \
+                       (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \
+                       2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += outDataPitch2 * bytesPerPixel;
+        pCoeff  += coeffPitch3;
+      }  /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/
+    }    /* end of for (y = 0; y < outH; y += 2)*/
+  }      /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+
+
+  /* Handles the case where the remaining output width is less than or equal to
+   * vectorizationWidth58. The loop across output height is unrolled twice. The loops
+   * across kernel width and height are completely unrolled. 64 bytes of input are loaded.
+   */
+  if (x < outW)
+  {
+    for (y = 0; y < outH; y += 2)  /* Loop across Output height */
+    {
+      /* In order to handle odd output height */
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer*/
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh++)  /* Loop across Output depth */
+      {
+        /* load and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2;
+        dacc1 = dacc2 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) pInput;
+
+        for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input depth */
+        {
+          /* 2 4-tap multipliers are used to accumulate 1 wide vector.
+           * The first 4-tap multiplier makes use of the first 4 coeffs
+           * across the kernel width. The next 4-tap multiplier makes use of
+           * the last 3 coeffs across the kernel width; its 4th byte is zero.
+           */
+
+          xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5, \
+                     dvecInData6, dvecInData7, dvecInData8;
+
+          /* load data from first input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 4th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 5th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData5, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 6th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData6, vaInData, pdvecIn, inDataPitch1);
+
+          /* load data from 7th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData7, vaInData, pdvecIn, inDataPitch1 * enable2ndRow);
+
+          /* load data from 8th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData8, vaInData, pdvecIn, inDataPitch2 - (6 + enable2ndRow) * inDataPitch1);
+
+          /* load first row of the coeff */
+          xb_vec2Nx8 dvecCoeffData1;
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with first coeff row */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData2, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 2nd row of the coeff */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with 2nd coeff row */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData2, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData3, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 3rd row of the coeff */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with 3rd coeff row */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData3, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData4, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 4th row of the coeff */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with 4th coeff row */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData4, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData5, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData5, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 5th row of the coeff */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with 5th coeff row */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData5, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData5, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData6, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData6, IVP_SELI_8B_ROTATE_RIGHT_4),
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 6th row of the coeff */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with 6th coeff row */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData6, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData6, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData7, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData7, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 7th row of the coeff */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply input vectors with 7th coeff row */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData7, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData7, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData8, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData8, IVP_SELI_8B_ROTATE_RIGHT_4), \
+                          IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores */
+        varLen = XT_MIN(vectorizationWidth58, outW - x);
+
+        /* Storing the first depth output */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* first depth , 2nd row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \
+                       2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += outDataPitch2 * bytesPerPixel;
+        pCoeff  += coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/
+    }   /* end of for (y = 0; y < outH; y += 2)*/
+  }     /* end of if(x < outW)*/
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+* 7x7 MOW WHD Stride 2 - DEPTH 3                                                          *
+* This function is called when the number of input channels                               *
+* is equal to 3.                                                                          *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j2d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Pitches of Coefficient Data (WHDN) dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8* restrict pdvecIn1;
+  MORPH_IDT_2Nx8* restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t outCh, x, y;
+  int32_t varLen;
+  /* No. of output elements that can be processed from 2 input loads */
+  const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1;
+
+  /* generates the sequence to shuffle the coeff */
+  /* 0 1 2 3 4 5 6 7  7 8 9 10 11 12 13 14  8 9 10 11 12 13 14 15 .. */
+  xb_vec2Nx8 dvecShflIdx = IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 7), \
+                                        IVP_SEQ2NX8(), IVP_SELI_8B_INTERLEAVE_8_LO);
+  /* 0 2 4 6 8 10 12 14 14 16 18 20 22 24 26 28  28 30 32 34 36 38 40 42 42 44 46 48 50 52 ...  */
+  dvecShflIdx = IVP_SEL2NX8I(IVP_SLLI2NX8(IVP_ADD2NX8U(dvecShflIdx, 14), 1), \
+                             IVP_SLLI2NX8(dvecShflIdx, 1), IVP_SELI_8B_PACK_16);
+  /* Assuming that 50th index will have zero values */
+  /* Final shuffle index pattern will be
+      0  2  4  6   1  3  5 50   8 10 12 50   7 9 11 13
+     14 16 18 20  15 17 19 50  22 24 26 50  21 23 25 27
+     28 30 32 34  29 31 33 50  36 38 40 50  35 37 39 41
+     42 44 46 48  43 45 47 50
+   */
+  dvecShflIdx = IVP_SEL2NX8I(
+    IVP_MOV2NX8T(50, IVP_ADD2NX8(dvecShflIdx, IVP_SEL2NX8I(-1, 1, IVP_SELI_8B_INTERLEAVE_4_LO)),
+                 IVP_EQ2NX8(IVP_AND2NX8(IVP_SEQ2NX8(), 7), 3)),
+    IVP_MOV2NX8T(50, dvecShflIdx, IVP_EQ2NX8(IVP_AND2NX8(IVP_SEQ2NX8(), 7), 7)),
+    IVP_SELI_8B_INTERLEAVE_4_LO);
+
+  /* The inner most loop runs across the kernel height and produces
+   * 4 output vectors - 2 output rows from 2 output channels. Unrolling across the output
+   * channels by 2 helps in re-using the already loaded input data. Unrolling across the
+   * output height by 2 helps in re-using the already loaded coeff data.
+   * The coefficients are arranged in such a way that MORPH_OP_MUL4TA can be used.
+   */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */
+  {
+    for (y = 0; y < outH; y += 2) /* Loop across output height */
+    {
+      /* In order to handle odd output heights */
+      int32_t enable2Row = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+      /* initialize coeff data pointer and bias data pointer to outCh kernel */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across output channels */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* Initialize the acc for 1st and 2nd output row of 1st channel with bias data */
+        xb_vec2Nx24 dacc1;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        /* Initialize the acc for 1st and 2nd output row of 2nd channel with bias data */
+        xb_vec2Nx24 dacc2;
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+        /* priming for coeff loads */
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        xb_vec2Nx8 dvecI0, dvecI1;
+        /******************************** 1st inCh **************************************/
+        /* Load vectors from first row */
+        valign vaInData1; vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        xb_vec2Nx8 dvecInData1; IVP_LA2NX8_XP(dvecInData1, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* Load vectors from 2nd row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        xb_vec2Nx8 dvecInData2; IVP_LA2NX8_XP(dvecInData2, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* Load vectors from 3rd row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        xb_vec2Nx8 dvecInData3; IVP_LA2NX8_XP(dvecInData3, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* Load vectors from 4th row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        xb_vec2Nx8 dvecInData4; IVP_LA2NX8_XP(dvecInData4, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* Load vectors from 5th row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        xb_vec2Nx8 dvecInData5; IVP_LA2NX8_XP(dvecInData5, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* Load vectors from 6th row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        xb_vec2Nx8 dvecInData6; IVP_LA2NX8_XP(dvecInData6, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* Load vectors from 7th row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        xb_vec2Nx8 dvecInData7; IVP_LA2NX8_XP(dvecInData7, vaInData1, pdvecIn1, \
+                                              inDataPitch1 * enable2Row);
+
+        /* Load vectors from 8th row, to be used by 2nd output height */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        xb_vec2Nx8 dvecInData8; IVP_LA2NX8_XP(dvecInData8, vaInData1, pdvecIn1, \
+                                              inDataPitch1 * enable2Row);
+
+        /* Load vectors from 9th row, to be used by 2nd output height */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        xb_vec2Nx8 dvecInData9; IVP_LA2NX8_XP(dvecInData9, vaInData1, pdvecIn1, \
+                                              inDataPitch2 - (6 + 2 * enable2Row) * inDataPitch1);
+
+        /* Load the 7x7 coefficients for 2 output channels */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+
+        /* rearrange the coeff in desired format, so that MUL4T can be used */
+        dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecShflIdx);
+        dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecShflIdx);
+
+        /* Re-arrange the data in the desired format                              */
+        /* Assume input as 0,1,2, .. 63 for two rows                              */
+        /* After re-arrangement using DSEL operation, updated vectors would be    */
+        /* dvecI0 : 0, 2, 4, ...                                                  */
+        /* dvecI1 : 1, 3, 5, ...                                                  */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 1st row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* Mulitply 1st row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+        /* Mulitply 2nd row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+        /* Mulitply 2nd row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 3rd row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+        /* Mulitply 3rd row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 4th row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6));
+
+        /* Mulitply 4th row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6));
+
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+        /* Mulitply 5th row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+
+        /* Mulitply 5th row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData8, dvecInData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 6th row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 11));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 10));
+
+        /* Mulitply 6th row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 11));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 10));
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData9, dvecInData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 7th row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 12));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 13));
+
+        /* Mulitply 7th row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 12));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 13));
+
+        /***************************** 2nd inCh **********************************************/
+        /* Load vectors from first row */
+
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData1, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* Load vectors from 2nd row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData2, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* Load vectors from 3rd row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData3, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* Load vectors from 4th row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData4, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* Load vectors from 5th row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData5, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* Load vectors from 6th row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData6, vaInData1, pdvecIn1, inDataPitch1);
+
+        /* Load vectors from 7th row */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData7, vaInData1, pdvecIn1, inDataPitch1 * enable2Row);
+
+        /* Load vectors from 8th row, to be used by 2nd output height */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_XP(dvecInData8, vaInData1, pdvecIn1, inDataPitch1 * enable2Row);
+
+        /* Load vectors from 9th row, to be used by 2nd output height */
+        vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        IVP_LA2NX8_IP(dvecInData9, vaInData1, pdvecIn1);
+
+        /* Load the 7x7 coefficients for 2 output channels */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+
+        /* rearrange the coeff in desired format, so that MUL4T can be used */
+        dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecShflIdx);
+        dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecShflIdx);
+
+        /* Re-arrange the data in the desired format                              */
+        /* Assume input as 0,1,2, .. 63 for two rows                              */
+        /* After re-arrangement using DSEL operation, updated vectors would be    */
+        /* dvecI0 : 0, 2, 4, ...                                                  */
+        /* dvecI1 : 1, 3, 5, ...                                                  */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 1st row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* Mulitply 1st row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+        /* Mulitply 2nd row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+        /* Mulitply 2nd row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 3rd row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+        /* Mulitply 3rd row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 4th row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6));
+
+        /* Mulitply 4th row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6));
+
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+        /* Mulitply 5th row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+
+        /* Mulitply 5th row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData8, dvecInData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 6th row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 11));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 10));
+
+        /* Mulitply 6th row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 11));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 10));
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData9, dvecInData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 7th row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 12));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 13));
+
+        /* Mulitply 7th row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 12));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 13));
+
+        /******************************* 3rd inCh *********************************************/
+        /* Load vectors from first row */
+        valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData1, vaInData2, pdvecIn2, inDataPitch1);
+
+        /* Load vectors from 2nd row */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData2, vaInData2, pdvecIn2, inDataPitch1);
+
+        /* Load vectors from 3rd row */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData3, vaInData2, pdvecIn2, inDataPitch1);
+
+        /* Load vectors from 4th row */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData4, vaInData2, pdvecIn2, inDataPitch1);
+
+        /* Load vectors from 5th row */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData5, vaInData2, pdvecIn2, inDataPitch1);
+
+        /* Load vectors from 6th row */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData6, vaInData2, pdvecIn2, inDataPitch1);
+
+        /* Load vectors from 7th row */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData7, vaInData2, pdvecIn2, inDataPitch1 * enable2Row);
+
+        /* Load vectors from 8th row, to be used by 2nd output height */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_XP(dvecInData8, vaInData2, pdvecIn2, inDataPitch1 * enable2Row);
+
+        /* Load vectors from 9th row, to be used by 2nd output height */
+        vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        IVP_LA2NX8_IP(dvecInData9, vaInData2, pdvecIn2);
+
+        /* Load the 7x7 coefficients for 2 output channels */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+
+        /* rearrange the coeff in desired format, so that MUL4T can be used */
+        dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecShflIdx);
+        dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecShflIdx);
+
+        /* Re-arrange the data in the desired format                              */
+        /* Assume input as 0,1,2, .. 63 for two rows                              */
+        /* After re-arrangement using DSEL operation, updated vectors would be    */
+        /* dvecI0 : 0, 2, 4, ...                                                  */
+        /* dvecI1 : 1, 3, 5, ...                                                  */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 1st row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+        /* Mulitply 1st row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+        /* Mulitply 2nd row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+        /* Mulitply 2nd row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 3rd row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+        /* Mulitply 3rd row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 4th row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6));
+
+        /* Mulitply 4th row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6));
+
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+        /* Mulitply 5th row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+
+        /* Mulitply 5th row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData8, dvecInData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 6th row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 11));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 10));
+
+        /* Mulitply 6th row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 11));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 10));
+
+        /* rearrange input vectors */
+        IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData9, dvecInData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+        /* Mulitply 7th row with coeff from 1st output channel */
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 12));
+        MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 13));
+
+        /* Mulitply 7th row with coeff from 2nd output channel */
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 12));
+        MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                          (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 13));
+
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh ], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores */
+        varLen = XT_MIN(vectorizationWidth, outW - x);
+
+        /* Storing the 1st row output from 1st channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row output from 1st channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut1L, dvecOut1L, IVP_SELI_EXTRACT_HI_HALVES), vaOutData, \
+                       pdvecOut, (-typeFlag + 1) * varLen * enable2Row);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2Row);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 1st row output from 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, enable2ndCh * bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row output from 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch1 * enable2Row + \
+                                              outDataPitch2 * enable2ndCh) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut2L, dvecOut2L, IVP_SELI_EXTRACT_HI_HALVES), vaOutData, \
+                       pdvecOut, enable2ndCh * (-typeFlag + 1) * varLen * enable2Row);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, enable2ndCh * typeFlag * 2 * varLen * \
+                       enable2Row);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 2 * coeffPitch3;
+      }   /* END for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+    }     /* END for (y = 0; y < outH ; y += 2)*/
+  }       /* END for (x = 0; x < outW; x += vectorizationWidth)*/
+}
+
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_7x7j2d1_I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 7x7 3D convolution.  */
+/*               The MORPH pre-processor specifiers select which concrete     */
+/*               implementation is generated during the preprocessing stage.  */
+/*               This definition generates the 7x7 3D dilated convolution and */
+/*               the 7x7 3D VQ dilated convolution functions for U8 and S8    */
+/*               input data, with the input stride equal to 2.                */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XAI error code (XAI_ERR_TYPE)                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 7x7xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_7x7j2d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_7x7j2d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_7x7j2d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_7x7j2d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_7x7j2d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+//    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 7);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                           \
+                    XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE3D_EDGE(inTile, 3);
+    XAI_CHECK_STRIDE(param, 2);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  if (XAI_TILE3D_GET_DIM3(inTile) == 3)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j2d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Pitches of Coefficient Data (WHDN) dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8* restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  int32_t varLen;
+  /* No. of output elements that can be processed from 2 input loads */
+  const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1;
+
+  /* generates the sequence to shuffle the coeff */
+  /* 0 1 2 3 4 5 6 7  7 8 9 10 11 12 13 14  8 9 10 11 12 13 14 15 .. */
+  xb_vec2Nx8 dvecShflIdx = IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 7), \
+                                        IVP_SEQ2NX8(), IVP_SELI_8B_INTERLEAVE_8_LO);
+  /* 0 2 4 6 8 10 12 14 14 16 18 20 22 24 26 28  28 30 32 34 36 38 40 42 42 44 46 48 50 52 ...  */
+  dvecShflIdx = IVP_SEL2NX8I(IVP_SLLI2NX8(IVP_ADD2NX8U(dvecShflIdx, 14), 1), \
+                             IVP_SLLI2NX8(dvecShflIdx, 1), IVP_SELI_8B_PACK_16);
+  /* Assuming that 50th index will have zero values */
+  /* Final shuffle index pattern will be
+      0  2  4  6   1  3  5 50   8 10 12 50   7 9 11 13
+     14 16 18 20  15 17 19 50  22 24 26 50  21 23 25 27
+     28 30 32 34  29 31 33 50  36 38 40 50  35 37 39 41
+     42 44 46 48  43 45 47 50
+   */
+  dvecShflIdx = IVP_SEL2NX8I(
+    IVP_MOV2NX8T(50, IVP_ADD2NX8(dvecShflIdx, IVP_SEL2NX8I(-1, 1, IVP_SELI_8B_INTERLEAVE_4_LO)),
+                 IVP_EQ2NX8(IVP_AND2NX8(IVP_SEQ2NX8(), 7), 3)),
+    IVP_MOV2NX8T(50, dvecShflIdx, IVP_EQ2NX8(IVP_AND2NX8(IVP_SEQ2NX8(), 7), 7)),
+    IVP_SELI_8B_INTERLEAVE_4_LO);
+
+  /* The inner most loop runs across the kernel height and produces
+   * 4 output vectors - 2 output rows from 2 output channels. Unrolling across the output
+   * channels by 2 helps in re-using the already loaded input data. Unrolling across the
+   * output height by 2 helps in re-using the already loaded coeff data.
+   * The coefficients are arranged in such a way that MORPH_OP_MUL4TA can be used.
+   */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */
+  {
+    for (y = 0; y < outH - 1; y += 2) /* Loop across output height */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+      /* initialize coeff data pointer and bias data pointer to outCh kernel */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across output channels */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* Initialize the acc for 1st and 2nd output row of 1st channel with bias data */
+        xb_vec2Nx24 dacc1;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        /* Initialize the acc for 1st and 2nd output row of 2nd channel with bias data */
+        xb_vec2Nx24 dacc2;
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+
+        for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */
+        {
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+          xb_vec2Nx8 dvecI0, dvecI1;
+
+          /* Load vectors from first row */
+          valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData1; IVP_LA2NX8_XP(dvecInData1, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 2nd row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData2; IVP_LA2NX8_XP(dvecInData2, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 3rd row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData3; IVP_LA2NX8_XP(dvecInData3, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 4th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData4; IVP_LA2NX8_XP(dvecInData4, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 5th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData5; IVP_LA2NX8_XP(dvecInData5, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 6th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData6; IVP_LA2NX8_XP(dvecInData6, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 7th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData7; IVP_LA2NX8_XP(dvecInData7, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 8th row, to be used by 2nd output height */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData8; IVP_LA2NX8_XP(dvecInData8, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 9th row, to be used by 2nd output height */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData9; IVP_LA2NX8_XP(dvecInData9, vaInData, pdvecIn, \
+                                                inDataPitch2 - 8 * inDataPitch1);
+
+          /* Load the 7x7 coefficients for 2 output channels */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+
+          /* rearrange the coeff in desired format, so that MUL4T can be used */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecShflIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecShflIdx);
+
+          /* Re-arrange the data in the desired format                              */
+          /* Assume input as 0,1,2, .. 63 for two rows                              */
+          /* After re-arrangement using DSEL operation, updated vectors would be    */
+          /* dvecI0 : 0, 2, 4, ...                                                  */
+          /* dvecI1 : 1, 3, 5, ...                                                  */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Mulitply 1st row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* Mulitply 1st row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+          /* Mulitply 2nd row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+          /* Mulitply 2nd row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Mulitply 3rd row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          /* Mulitply 3rd row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Mulitply 4th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6));
+
+          /* Mulitply 4th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6));
+
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+          /* Mulitply 5th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+
+          /* Mulitply 5th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData8, dvecInData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Mulitply 6th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 11));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 10));
+
+          /* Mulitply 6th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 11));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 10));
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData9, dvecInData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Mulitply 7th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 12));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 13));
+
+          /* Mulitply 7th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 12));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 13));
+        }   /* END for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores */
+        varLen = XT_MIN(vectorizationWidth, outW - x);
+
+        /* Storing the 1st row output from 1st channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row output from 1st channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut1L, dvecOut1L, IVP_SELI_EXTRACT_HI_HALVES), vaOutData, \
+                       pdvecOut, (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 1st row output from 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, enable2ndCh * bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row output from 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch1 + \
+                                              outDataPitch2 * enable2ndCh) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut2L, dvecOut2L, IVP_SELI_EXTRACT_HI_HALVES), vaOutData, \
+                       pdvecOut, enable2ndCh * (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, enable2ndCh * typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 2 * coeffPitch3;
+      } /* END for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+    }   /* END for (y = 0; y < outH - 1; y += 2)*/
+    if (y < outH)
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+      /* initialize coeff data pointer and bias data pointer to outCh kernel */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across output channels */
+      {
+        /* In order to handle odd output depths */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* Initialize the acc for 1st and 2nd output row of 1st channel with bias data */
+        xb_vec2Nx24 dacc1;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        /* Initialize the acc for 1st and 2nd output row of 2nd channel with bias data */
+        xb_vec2Nx24 dacc2;
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+
+        for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */
+        {
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+          xb_vec2Nx8 dvecI0, dvecI1;
+
+          /* Load vectors from first row */
+          valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData1; IVP_LA2NX8_XP(dvecInData1, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 2nd row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData2; IVP_LA2NX8_XP(dvecInData2, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 3rd row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData3; IVP_LA2NX8_XP(dvecInData3, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 4th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData4; IVP_LA2NX8_XP(dvecInData4, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 5th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData5; IVP_LA2NX8_XP(dvecInData5, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 6th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData6; IVP_LA2NX8_XP(dvecInData6, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 7th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData7; IVP_LA2NX8_XP(dvecInData7, vaInData, pdvecIn, \
+                                                inDataPitch2 - 6 * inDataPitch1);
+
+          /* Load the 7x7 coefficients for 2 output channels */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+
+          /* rearrange the coeff in desired format, so that MUL4T can be used */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecShflIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecShflIdx);
+
+          /* Re-arrange the data in the desired format                              */
+          /* Assume input as 0,1,2, .. 63 for two rows                              */
+          /* After re-arrangement using DSEL operation, updated vectors would be    */
+          /* dvecI0 : 0, 2, 4, ...                                                  */
+          /* dvecI1 : 1, 3, 5, ...                                                  */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 1st row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* Multiply 1st row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+          /* Multiply 2nd row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+          /* Multiply 2nd row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 3rd row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          /* Multiply 3rd row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 4th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6));
+
+          /* Multiply 4th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6));
+
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+          /* Multiply 5th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+
+          /* Multiply 5th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, 0, dvecInData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 6th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 11));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 10));
+
+          /* Multiply 6th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 11));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 10));
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, 0, dvecInData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 7th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 12));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 13));
+
+          /* Multiply 7th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 12));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 13));
+        }   /* END for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores */
+        varLen = XT_MIN(vectorizationWidth, outW - x);
+
+        /* Storing the 1st row output from 1st channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 1st row output from 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, enable2ndCh * bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 2 * coeffPitch3;
+      }   /* END for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+    }     /* end of if(y < outH) */
+  }       /* END for (x = 0; x < outW; x += vectorizationWidth)*/
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+* 7x7 MOW WHD Stride 4 - DEPTH 3                                                          *
+* If number of input channels is equal to 3                                               *
+* this function is called.                                                                *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j4d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Pitches of Coefficient Data (WHDN) dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, *restrict pdvecCoeff2, *restrict pdvecCoeff3, \
+  * restrict pdvecCoeff4;
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky;
+  int32_t varLen;
+  /* Number of output elements that can be generated
+   * with 2 input vector loads(64 way).*/
+  const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1;
+
+  /* loop across output depth is unrolled by 4
+   * , producing lanes from 4 output channels
+   * in one iteration. Since vectorization width
+   * is just half the width of the accumulator,
+   * loop across output height is also unrolled by 2.
+   * Unrolling across output height makes it possible
+   * to utilize all the 64 MACs in the accumulator.
+   *
+   * Data loaded from the 2 input rows is concatenated
+   * in such a manner that the lower half of the output
+   * vector gives the first output row and the upper
+   * half of the output vector gives the second output row. */
+  for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+  {
+    /* out of bound flag */
+    int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+    for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+    {
+      /* In order to handle odd heights*/
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize coeff and bias data pointers to the start of their data */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4)   /* Loop across Output depth */
+      {
+        /* In order to handle depths that are not a multiple of 4 */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* loads and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+
+        pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh);
+        valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4);
+        /*************************** 1st inCh ******************************/
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x];
+
+        /* variable declarations for input and coeff vectors */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4;
+        MORPH_IDT_2Nx8 dvecInData11, dvecInData12;
+        MORPH_IDT_2Nx8 dvecInData21, dvecInData22;
+        MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4, dvecData5, dvecData6, dvecData7;
+
+        /* load coeff for all the 4 output channels*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2);
+
+        for (ky = 0; ky < 7; ky++)   /* Loop across kernel height */
+        {
+          /* loads 1st input row */
+          pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 5th(corresponding to the 2nd output row) input row */
+          pdvecIn  = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2ndRow);
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          pInput += inDataPitch1;
+
+          /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+           * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is
+           * 128,129,130,131.........252,253,254,255, Data should be arranged  as
+           *
+           * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252
+           * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253
+           * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254
+           * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255
+           *
+           * Lower half of the vectors contains data from the 1st input row and
+           * upper half of the vectors contains data from the 2nd input row.
+           *
+           */
+
+          IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData3, dvecData1, dvecInData21, dvecInData11, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData2, dvecInData22, dvecInData12, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          dvecData5 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecData6 = IVP_SEL2NX8I(0, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecData7 = IVP_SEL2NX8I(0, dvecData3, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from 1st output channel and accumulate */
+
+          /* IVP_EXTRVRN_2X32 extracts the required coeff from the
+           * coeff vector. In every iteration ky is updated therefore it
+           * extracts coeff from the next coeff row in the successive ky
+           * iterations. "ky * coeffPitch1 + 4" extracts coeffs next to
+           * first four coeff in a row
+           */
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)),    \
+                           ky * coeffPitch1));
+          MORPH_OP_MULQA(dacc1, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), \
+                           (ky * coeffPitch1 + 4)));
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from 2nd output channel and accumulate */
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)),    \
+                           ky * coeffPitch1));
+          MORPH_OP_MULQA(dacc2, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), \
+                           (ky * coeffPitch1 + 4)));
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from 3rd output channel and accumulate */
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)),    \
+                           ky * coeffPitch1));
+          MORPH_OP_MULQA(dacc3, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), \
+                           (ky * coeffPitch1 + 4)));
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from 4th output channel and accumulate */
+          MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)),    \
+                           ky * coeffPitch1));
+          MORPH_OP_MULQA(dacc4, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), \
+                           (ky * coeffPitch1 + 4)));
+        }   /* end of for (ky = 0; ky < 7; ky++)*/
+
+        /********************************* 2nd inCh ***************************************/
+
+        /* initialize input data pointer */
+        pInput = &pInData[inDataPitch2 + inDataPitch1 * stride * y + stride * x];
+
+        /* load coeff for all the 4 output channels*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2);
+
+        for (ky = 0; ky < 7; ky++)   /* Loop across kernel height */
+        {
+          /* loads 1st input row */
+          pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 5th(corresponding to the 2nd output row) input row */
+          pdvecIn  = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2ndRow);
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          pInput += inDataPitch1;
+
+          IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData3, dvecData1, dvecInData21, dvecInData11, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData2, dvecInData22, dvecInData12, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          dvecData5 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecData6 = IVP_SEL2NX8I(0, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecData7 = IVP_SEL2NX8I(0, dvecData3, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from 1st output channel and accumulate */
+
+          /* IVP_EXTRVRN_2X32 extracts the required coeff from the
+           * coeff vector. In every iteration ky is updated therefore it
+           * extracts coeff from the next coeff row in the successive ky
+           * iterations. "ky * coeffPitch1 + 4" extracts coeffs next to
+           * first four coeff in a row
+           */
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)),    \
+                           ky * coeffPitch1));
+          MORPH_OP_MULQA(dacc1, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), \
+                           (ky * coeffPitch1 + 4)));
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from 2nd output channel and accumulate */
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)),    \
+                           ky * coeffPitch1));
+          MORPH_OP_MULQA(dacc2, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), \
+                           (ky * coeffPitch1 + 4)));
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from 3rd output channel and accumulate */
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)),    \
+                           ky * coeffPitch1));
+          MORPH_OP_MULQA(dacc3, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), \
+                           (ky * coeffPitch1 + 4)));
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from 4th output channel and accumulate */
+          MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)),    \
+                           ky * coeffPitch1));
+          MORPH_OP_MULQA(dacc4, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), \
+                           (ky * coeffPitch1 + 4)));
+        }   /* end of for (ky = 0; ky < 7; ky++)*/
+
+        /************************************ 3rd inCh *************************************/
+        /* initialize input data pointer */
+        pInput = &pInData[2 * inDataPitch2 + inDataPitch1 * stride * y + stride * x];
+
+        /* load coeff for all the 4 output channels*/
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2);
+        IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2);
+
+        for (ky = 0; ky < 7; ky++)   /* Loop across kernel height */
+        {
+          /* loads 1st input row */
+          pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+          /* loads 5th(corresponding to the 2nd output row) input row */
+          pdvecIn  = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2ndRow);
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+          pInput += inDataPitch1;
+
+          IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData3, dvecData1, dvecInData21, dvecInData11, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData2, dvecInData22, dvecInData12, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          dvecData5 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecData6 = IVP_SEL2NX8I(0, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+          dvecData7 = IVP_SEL2NX8I(0, dvecData3, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from 1st output channel and accumulate */
+
+          /* IVP_EXTRVRN_2X32 extracts the required coeff from the
+           * coeff vector. In every iteration ky is updated therefore it
+           * extracts coeff from the next coeff row in the successive ky
+           * iterations. "ky * coeffPitch1 + 4" extracts coeffs next to
+           * first four coeff in a row
+           */
+
+          MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)),    \
+                           ky * coeffPitch1));
+          MORPH_OP_MULQA(dacc1, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), \
+                           (ky * coeffPitch1 + 4)));
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from 2nd output channel and accumulate */
+          MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)),    \
+                           ky * coeffPitch1));
+          MORPH_OP_MULQA(dacc2, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), \
+                           (ky * coeffPitch1 + 4)));
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from 3rd output channel and accumulate */
+          MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)),    \
+                           ky * coeffPitch1));
+          MORPH_OP_MULQA(dacc3, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), \
+                           (ky * coeffPitch1 + 4)));
+
+          /* multiplies data from two rows(Lower and upper half of dvecData)
+           * with coeff from 4th output channel and accumulate */
+          MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)),    \
+                           ky * coeffPitch1));
+          MORPH_OP_MULQA(dacc4, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                           (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), \
+                           (ky * coeffPitch1 + 4)));
+        }   /* end of for (ky = 0; ky < 7; ky++)*/
+
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable store count */
+        varLen = XT_MIN(outW - x, vectorizationWidth);
+
+        /* store the first half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         */
+
+        /* Storing the first row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* extract the half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         * and store in the next row
+         */
+
+        /* Storing the 2nd row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable2ndCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable3rdCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable4thCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      } /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }   /* end of for (y = 0; y < outH; y += 2)*/
+  }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+}
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_7x7j4d1I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 7x7 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 7x7 3D dilated convolution function and 7x7 */
+/*               3D VQ dilated convolution function for U8 bit and S8 bit     */
+/*               input data with input stride equal to 4                      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 7x7xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_7x7j4d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_7x7j4d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_7x7j4d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_7x7j4d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_7x7j4d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 7);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                           \
+                    XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE3D_EDGE(inTile, 3);
+    XAI_CHECK_STRIDE(param, 4);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  if (XAI_TILE3D_GET_DIM3(inTile) == 3)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j4d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Pitches of Coefficient Data (WHDN) dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2, * restrict pdvecCoeff3, \
+  * restrict pdvecCoeff4;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  int32_t varLen;
+  /* Number of output elements that can be generated
+   * with 2 input vector loads(64 way).*/
+  const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1;
+
+  /* loop across output depth is unrolled by 4
+   * , producing lanes from 4 output channels
+   * in one iteration. Since vectorization width
+   * is just half the width of the accumulator,
+   * loop across output height is also unrolled by 2.
+   * Unrolling across output height makes it possible
+   * to utilize all the 64 MACs in the accumulator.
+   *
+   * Data loaded from the 2 input rows is concatenated
+   * in such a manner that lower half of the output
+   * vector gives the first output row and the upper
+   * half of the output vector gives the second output row. */
+  for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+  {
+    /* out of bound flag */
+    int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+    for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+    {
+      /* In order to handle odd heights*/
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize coeff and bias data pointers to the start of their arrays */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4)   /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* loads and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+
+        pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh);
+        valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4);
+
+        for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+        {
+          /* initialize input data pointer */
+          MORPH_IDT_SCALAR *pInput = &pInData[inCh * inDataPitch2 + \
+                                              inDataPitch1 * stride * y + stride * x];
+
+          /* variable declarations for input and coeff vectors */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4;
+          MORPH_IDT_2Nx8 dvecInData11, dvecInData12;
+          MORPH_IDT_2Nx8 dvecInData21, dvecInData22;
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4, dvecData5, dvecData6, dvecData7;
+
+          /* load coeff for all the 4 output channels*/
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2);
+
+          for (ky = 0; ky < 7; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 5th(corresponding to the 2nd output row) input row */
+            pdvecIn  = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2ndRow);
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            pInput += inDataPitch1;
+
+            /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+             * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is
+             * 128,129,130,131.........252,253,254,255, Data should be arranged  as
+             *
+             * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252
+             * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253
+             * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254
+             * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255
+             *
+             * Lower half of the vectors contain data from 1st input row and
+             * upper half of the vectors contain data from 2nd input row.
+             *
+             */
+
+            IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecData3, dvecData1, dvecInData21, dvecInData11, \
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecData4, dvecData2, dvecInData22, dvecInData12, \
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            dvecData5 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+            dvecData6 = IVP_SEL2NX8I(0, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+            dvecData7 = IVP_SEL2NX8I(0, dvecData3, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+            /* multiplies data from two rows(Lower and upper half of dvecData)
+             * with coeff from 1st output channel and accumulate */
+
+            /* IVP_EXTRVRN_2X32 extracts the required coeff from the
+             * coeff vector. In every iteration ky is updated therefore it
+             * extracts coeff from the next coeff row in the successive ky
+             * iterations. "ky * coeffPitch1 + 4" extracts coeffs next to
+             * first four coeff in a row
+             */
+
+            MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)),    \
+                             ky * coeffPitch1));
+            MORPH_OP_MULQA(dacc1, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), \
+                             (ky * coeffPitch1 + 4)));
+
+            /* multiplies data from two rows(Lower and upper half of dvecData)
+             * with coeff from 2nd output channel and accumulate */
+            MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)),    \
+                             ky * coeffPitch1));
+            MORPH_OP_MULQA(dacc2, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), \
+                             (ky * coeffPitch1 + 4)));
+
+            /* multiplies data from two rows(Lower and upper half of dvecData)
+             * with coeff from 3rd output channel and accumulate */
+            MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)),    \
+                             ky * coeffPitch1));
+            MORPH_OP_MULQA(dacc3, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), \
+                             (ky * coeffPitch1 + 4)));
+
+            /* multiplies data from two rows(Lower and upper half of dvecData)
+             * with coeff from 4th output channel and accumulate */
+            MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)),    \
+                             ky * coeffPitch1));
+            MORPH_OP_MULQA(dacc4, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32      \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), \
+                             (ky * coeffPitch1 + 4)));
+          }   /* end of for (ky = 0; ky < 7; ky++)*/
+        }     /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable store count */
+        varLen = XT_MIN(outW - x, vectorizationWidth);
+
+        /* store the first half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         */
+
+        /* Storing the first row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* extract the half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         * and store in the next row
+         */
+
+        /* Storing the 2nd row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable2ndCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable3rdCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable4thCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      } /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }   /* end of for (y = 0; y < outH; y += 2)*/
+  }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  return(XAI_ERROR_STATUS());
+}
+
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_7x7j1d2I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 7x7 3D convolution   */
+/*               with dilation = 2                                            */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 7x7 3D dilated convolution function and 7x7 */
+/*               3D VQ dilated convolution function for U8 bit and S8 bit     */
+/*               input data with input stride equal to 1                      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 7x7xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_7x7j1d2_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_7x7j1d2_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_7x7j1d2_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_7x7j1d2_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_7x7j1d2), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 7);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_TILE3D_EDGE(inTile, 6);
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_DILATION(param, 2);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN) */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t dilationU     = XAI_CNN_CONV_GET_DILATION(param);
+
+  int32_t dilatedkSizeU = dilationU * (kSizeU - 1) + 1;
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((dilatedkSizeU / 2) * inDataPitch1 + (dilatedkSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+
+  /* 0 1 2 3 .. 62 63*/
+  xb_vec2Nx8 dvecPattern1 = IVP_SEQ2NX8();
+  /* 64 65 66 67 ...126 127*/
+  xb_vec2Nx8 dvecPattern2 = IVP_ADD2NX8(dvecPattern1, 64);
+
+  if (!typeFlag)
+  {
+    MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \
+                   dvecPattern2, dvecPattern1, \
+                   IVP_DSELI_8B_INTERLEAVE_1);
+  }
+  else
+  {
+    MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \
+                   dvecPattern2, dvecPattern1, \
+                   IVP_DSELI_INTERLEAVE_1);
+  }
+
+  /* In order to make the loop multiply-bound we are reducing the vectorization width
+     by extra values required for the kernel */
+  const int32_t vectorizationWidth = ((4 * XCHAL_IVPN_SIMD_WIDTH) - dilatedkSizeU) + 1;
+
+  for (x = 0; x < outW; x += vectorizationWidth)  /* Loop across output width */
+  {
+    int32_t remX = XT_MIN(vectorizationWidth, outW - x);
+
+    /* If (remX + dilatedkSizeU - 1) <= 2 * XCHAL_IVPN_SIMD_WIDTH,
+     * i.e. if the number of input data bytes corresponding to remX number of outputs
+     * is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, there is no need to load
+     * the next 64 input bytes*/
+    int32_t remXLoad = ((remX + dilatedkSizeU - 1) > 2 * XCHAL_IVPN_SIMD_WIDTH) ? 1 : 0;
+
+    for (y = 0; y < outH; y++) /* Loop across output height */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer*/
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh++)  /* Loop across Output depth */
+      {
+        /* load and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2;
+        dacc1 = dacc2 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) pInput;
+
+        for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1;
+          xb_vec2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, \
+                     dvecInData51, dvecInData61, dvecInData71;
+          xb_vec2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, \
+                     dvecInData52, dvecInData62, dvecInData72;
+
+          /* load data from first input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 2nd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 3rd input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 4th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData42, dvecInData41, dvecInData42, dvecInData41, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 5th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData52, dvecInData51, dvecInData52, dvecInData51, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 6th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData62, dvecInData61, dvecInData62, dvecInData61, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 7th input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, inDataPitch2 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH - dilationU * 6 * inDataPitch1);
+
+          /*Separate odd and even indices */
+          IVP_DSEL2NX8I(dvecInData72, dvecInData71, dvecInData72, dvecInData71, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load 1st row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 1st row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData11, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData12, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 1st row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData11, dvecInData11,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData12, dvecInData12,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 2nd row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 2nd row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData21, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData22, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 2nd row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData21, dvecInData21,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData22, dvecInData22,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 3rd row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 3rd row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData31, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData32, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 3rd row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData31, dvecInData31,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData32, dvecInData32,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 4th row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 4th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData41, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData42, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 4th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData41, dvecInData41,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData42, dvecInData42,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 5th row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 5th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData51, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData52, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 5th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData51, dvecInData51,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData52, dvecInData52,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 6th row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 6th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData61, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData62, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 6th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData61, dvecInData61,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData62, dvecInData62,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 7th row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 7th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData71, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData72, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 7th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData71, dvecInData71,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData72, dvecInData72,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        }  /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvec1L, dvec2L;
+        xb_vec2Nx8 dvec1H, dvec2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Interleave odd and even indices */
+        xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1);
+        xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2);
+        xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1);
+        xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2);
+
+        /* Storing the first depth output , 1st row */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        /* NOTE(review): byte counts below can be <= 0; presumably the variable store clamps them - confirm */
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remX - \
+                       2 * XCHAL_IVPN_SIMD_WIDTH);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += outDataPitch2 * bytesPerPixel;
+        pCoeff  += coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/
+    }   /* end of for (y = 0; y < outH; y++)*/
+  }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_7x7j1d4I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 7x7 3D convolution   */
+/*               with dilation = 4                                            */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 7x7 3D dilated convolution function and 7x7 */
+/*               3D VQ dilated convolution function for U8 bit and S8 bit     */
+/*               input data with input stride equal to 1                      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 7x7xDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_7x7j1d4_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_7x7j1d4_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_7x7j1d4_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_7x7j1d4_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_7x7j1d4), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error checks: 7x7 kernel, stride 1, dilation 4, WHD in/out tiles, S8 WHDN coefficients, S32 bias */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 7);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_TILE3D_EDGE(inTile, 12);
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                         \
+                    XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_DILATION(param, 4);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting output/input dimensions from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitches of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel size, taken from dim1 of the coefficient tile (WHDN) */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t dilationU     = XAI_CNN_CONV_GET_DILATION(param);
+
+  int32_t dilatedkSizeU = dilationU * (kSizeU - 1) + 1;
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer back by (dilatedkSizeU / 2) rows and columns, i.e. to the start of the data including edge */
+  pInData = &pInData[-((dilatedkSizeU / 2) * inDataPitch1 + (dilatedkSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+
+  /* Loop counters */
+  int32_t inCh, outCh, x, y;
+
+  /* In order to keep the loop multiply-bound, the vectorization width of 4 * XCHAL_IVPN_SIMD_WIDTH
+     is reduced by the (dilatedkSizeU - 1) extra input values required for the kernel */
+  const int32_t vectorizationWidth = ((4 * XCHAL_IVPN_SIMD_WIDTH) - dilatedkSizeU) + 1;
+
+  for (x = 0; x < outW; x += vectorizationWidth)  /* Loop across output width */
+  {
+    int32_t remX = XT_MIN(vectorizationWidth, outW - x);
+
+    /* If (remX + dilatedkSizeU - 1) <= 2 * XCHAL_IVPN_SIMD_WIDTH,
+     * i.e. if the number of input data bytes corresponding to remX number of outputs
+     * is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, there is no need to load
+     * the next 2 * XCHAL_IVPN_SIMD_WIDTH input bytes (remXLoad is 0 in that case) */
+    int32_t remXLoad = ((remX + dilatedkSizeU - 1) > 2 * XCHAL_IVPN_SIMD_WIDTH) ? 1 : 0;
+
+    for (y = 0; y < outH; y++) /* Loop across output height */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x];
+
+      /* initialize coeff and Bias data pointer*/
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh++)  /* Loop across Output depth */
+      {
+        /* load and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2;
+        dacc1 = dacc2 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) pInput;
+
+        for (inCh = 0; inCh < numInCh; inCh++)  /* Loop across input channels */
+        {
+          /* vectors for coeff and input loads */
+          xb_vec2Nx8 dvecCoeffData1;
+          xb_vec2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, \
+                     dvecInData51, dvecInData61, dvecInData71;
+          xb_vec2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, \
+                     dvecInData52, dvecInData62, dvecInData72;
+
+          /* load data from 1st input row; the two load offsets sum to dilationU * inDataPitch1 (next dilated row) */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* Two DEINTERLEAVE_1 passes; presumably groups elements by index mod 4 to match dilation 4 -- TODO confirm */
+          IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 2nd dilated input row (same offset pattern as the 1st row) */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* Same two-pass deinterleave as applied to the 1st row */
+          IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 3rd dilated input row (same offset pattern as the 1st row) */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* Same two-pass deinterleave as applied to the 1st row */
+          IVP_DSEL2NX8I(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 4th dilated input row (same offset pattern as the 1st row) */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* Same two-pass deinterleave as applied to the 1st row */
+          IVP_DSEL2NX8I(dvecInData42, dvecInData41, dvecInData42, dvecInData41, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData42, dvecInData41, dvecInData42, dvecInData41, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 5th dilated input row (same offset pattern as the 1st row) */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* Same two-pass deinterleave as applied to the 1st row */
+          IVP_DSEL2NX8I(dvecInData52, dvecInData51, dvecInData52, dvecInData51, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData52, dvecInData51, dvecInData52, dvecInData51, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 6th dilated input row (same offset pattern as the 1st row) */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, dilationU * inDataPitch1 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* Same two-pass deinterleave as applied to the 1st row */
+          IVP_DSEL2NX8I(dvecInData62, dvecInData61, dvecInData62, dvecInData61, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData62, dvecInData61, dvecInData62, dvecInData61, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load data from 7th input row; offsets sum to inDataPitch2 - 6 * dilationU * inDataPitch1 (1st row, next channel) */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+          MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, inDataPitch2 - \
+                             remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH - dilationU * 6 * inDataPitch1);
+
+          /* Same two-pass deinterleave as applied to the 1st row */
+          IVP_DSEL2NX8I(dvecInData72, dvecInData71, dvecInData72, dvecInData71, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecInData72, dvecInData71, dvecInData72, dvecInData71, \
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load 1st row of coefficients (load length / pointer step is coeffPitch1) */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 1st row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData11, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData12, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 1st row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData11, dvecInData11,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData12, dvecInData12,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 2nd row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 2nd row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData21, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData22, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 2nd row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData21, dvecInData21,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData22, dvecInData22,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 3rd row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 3rd row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData31, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData32, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 3rd row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData31, dvecInData31,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData32, dvecInData32,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 4th row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 4th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData41, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData42, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 4th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData41, dvecInData41,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData42, dvecInData42,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 5th row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 5th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData51, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData52, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 5th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData51, dvecInData51,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData52, dvecInData52,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 6th row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 6th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData61, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData62, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 6th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData61, dvecInData61,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData62, dvecInData62,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* load 7th row of coefficients */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+          /* Multiply and accumulate 1st set of 4 coefficients from 7th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecInData71, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecInData72, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+          /* Multiply and accumulate 2nd set of 4 coefficients from 7th row for all the outputs */
+          MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData71, dvecInData71,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData72, dvecInData72,                  \
+                                                 IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        }  /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping (per-channel scale when DILATED_VQ_CONV == VQ_TRUE) */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        if (!typeFlag)
+        {
+          MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                         IVP_DSELI_8B_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                         IVP_DSELI_8B_INTERLEAVE_1);
+        }
+        else
+        {
+          MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \
+                         IVP_DSELI_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \
+                         IVP_DSELI_INTERLEAVE_1);
+          MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                         IVP_DSELI_INTERLEAVE_2);
+          MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \
+                         IVP_DSELI_INTERLEAVE_2);
+        }
+
+        /* Store remX outputs of row y for the current output channel (H vectors are stored only when typeFlag is set) */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remX - \
+                       2 * XCHAL_IVPN_SIMD_WIDTH);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                       (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += outDataPitch2 * bytesPerPixel;
+        pCoeff  += coeffPitch3;
+      } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/
+    }   /* end of for (y = 0; y < outH; y++)*/
+  }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  return(XAI_ERROR_STATUS());
+}
+
+
+/******************************************************************************************
+* MxN MOW WHD Stride 1 - DEPTH 3                                                          *
+* If number of input channels is equal to 3                                               *
+* this function is called.                                                                *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_MxNj1d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Pitches of Coefficient Data (WHDN) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  uint8_t leftEdge, topEdge;
+  if ((kSizeX % 2) != 0)
+  {
+    leftEdge = kSizeX / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kSizeX / 2) : ((kSizeX / 2) - 1);
+  }
+
+  if ((kSizeY % 2) != 0)
+  {
+    topEdge = kSizeY / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kSizeY / 2) : ((kSizeY / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn1;
+  MORPH_IDT_2Nx8 * restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky;
+  const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH;
+  int32_t varLen;
+
+  if (kSizeX > 12)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(64, inW - x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* handles odd output row */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          /* variable declarations for input and coeff vectors */
+
+          xb_vec2Nx8 dvecCoeffData11;
+          xb_vec2Nx8 dvecCoeffData21;
+
+          /* dvecInData11 refers to 1st input row, first 64(or lesser) elements
+           * and dvecInData12 refers to next few left out elements of the same row
+           * required to compute one 64 way output vector(To compute one 64 way
+           * output vector, we require 64 + edge1 + edge2 number of input elements)
+           */
+          xb_vec2Nx8 dvecInData11, dvecInData12;
+          xb_vec2Nx8 dvecInData21, dvecInData22;
+          /***************************** 1st inCh ****************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *)  (pInput + inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 2nd input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* load 1 row of coeff for 1st output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            /* load 1 row of coeff for 2nd output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* multiplies the loaded input data by the first four coeffs */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies the input data by the next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies the input data by the next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              2));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              2));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              2));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              2));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies the input data by the next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              3));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              3));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              3));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              3));
+          } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+          /***************************** 2nd inCh ****************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *)  (pInput + inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 2nd input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* load 1 row of coeff for 1st output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            /* load 1 row of coeff for 2nd output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* multiplies the loaded input data by the first four coeffs */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies the input data by the next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies the input data by the next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              2));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              2));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              2));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              2));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies the input data by the next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              3));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              3));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              3));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              3));
+          } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+          /***************************** 3rd inCh ****************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *)  (pInput + 2 * inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 2nd input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* load 1 row of coeff for 1st output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            /* load 1 row of coeff for 2nd output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* multiplies the loaded input data by the first four coeffs */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies the input data by the next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies the input data by the next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              2));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              2));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              2));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              2));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies the input data by the next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              3));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              3));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              3));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              3));
+          } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                                enable2ndRow * outDataPitch1) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kSizeX > 8)
+  {
+    /* The loop across output channels is unrolled twice
+     * to produce two output channels per iteration.
+     * The loop across output height also advances by 2, so
+     * 4 output vectors (2 rows x 2 channels) are produced together.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out-of-bounds flag from XT_SALT(64, inW - x); multiplies the 2 * XCHAL_IVPN_SIMD_WIDTH term of the input loads below */
+      int32_t flag = XT_SALT(64, inW - x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* handles odd output row */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel*/
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          /* variable declarations for input and coeff vectors */
+
+          xb_vec2Nx8 dvecCoeffData11;
+          xb_vec2Nx8 dvecCoeffData21;
+
+          /* dvecInData11 refers to the 1st input row, first 64 (or fewer) elements,
+           * and dvecInData12 refers to the remaining elements of the same row
+           * required to compute one 64-way output vector (to compute one 64-way
+           * output vector, we require 64 + edge1 + edge2 input elements).
+           */
+          xb_vec2Nx8 dvecInData11, dvecInData12;
+          xb_vec2Nx8 dvecInData21, dvecInData22;
+          /************************** 1st inCh *******************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *)  (pInput + inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 2nd input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* load 1 row of coeff for 1st output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            /* load 1 row of coeff for 2nd output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* multiplies the loaded input data by the first four coeffs */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies the input data by the next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies the input data by the next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              2));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              2));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              2));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              2));
+          } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+          /************************** 2nd inCh *******************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *)  (pInput + inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 2nd input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* load 1 row of coeff for 1st output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            /* load 1 row of coeff for 2nd output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* multiplies loaded input data with first four coeffs */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies input data with next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies input data with next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              2));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              2));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              2));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              2));
+          } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+          /************************** 3rd inCh *******************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *)  (pInput + 2 * inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 2nd input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* load 1 row of coeff for 1st output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            /* load 1 row of coeff for 2nd output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* multiplies loaded input data with first four coeffs */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies input data with next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies input data with next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              2));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              2));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              2));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              2));
+          } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                                enable2ndRow * outDataPitch1) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kSizeX > 4)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(64, inW - x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* handles odd output row */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          /* variable declarations for input and coeff vectors */
+
+          xb_vec2Nx8 dvecCoeffData11;
+          xb_vec2Nx8 dvecCoeffData21;
+
+          /* dvecInData11 refers to 1st input row, first 64(or lesser) elements
+           * and dvecInData12 refers to next few left out elements of the same row
+           * required to compute one 64 way output vector(To compute one 64 way
+           * output vector, we require 64 + edge1 + edge2 number of input elements)
+           */
+          xb_vec2Nx8 dvecInData11, dvecInData12;
+          xb_vec2Nx8 dvecInData21, dvecInData22;
+
+          /******************************* 1st inCh ************************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 2nd input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* load 1 row of coeff for 1st output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            /* load 1 row of coeff for 2nd output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* multiplies loaded input data with first four coeffs */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies input data with next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+          } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+          /******************************* 2nd inCh ************************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 2nd input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* load 1 row of coeff for 1st output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            /* load 1 row of coeff for 2nd output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* multiplies loaded input data with first four coeffs */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies input data with next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+          } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+          /******************************* 3rd inCh ************************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 2nd input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* load 1 row of coeff for 1st output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            /* load 1 row of coeff for 2nd output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* multiplies loaded input data with first four coeffs */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            /* right rotate the input vectors by 4
+             * in order to multiply with next column of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+            /* multiplies input data with next four coeffs from the same row */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              1));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              1));
+          } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                                enable2ndRow * outDataPitch1) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(64, inW - x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* handles odd output row */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          /* variable declarations for input and coeff vectors */
+
+          xb_vec2Nx8 dvecCoeffData11;
+          xb_vec2Nx8 dvecCoeffData21;
+
+          /* dvecInData11 refers to 1st input row, first 64(or lesser) elements
+           * and dvecInData12 refers to next few left out elements of the same row
+           * required to compute one 64 way output vector(To compute one 64 way
+           * output vector, we require 64 + edge1 + edge2 number of input elements)
+           */
+          xb_vec2Nx8 dvecInData11, dvecInData12;
+          xb_vec2Nx8 dvecInData21, dvecInData22;
+
+          /************************* 1st inCh *******************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 2nd input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* load 1 row of coeff for 1st output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            /* load 1 row of coeff for 2nd output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* multiples loaded input data with first four coeff */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+          } /* for (ky = 0; ky < kSizeY; ky++)*/
+
+          /************************* 2nd inCh *******************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 2nd input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* load 1 row of coeff for 1st output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            /* load 1 row of coeff for 2nd output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* multiples loaded input data with first four coeff */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+          } /* for (ky = 0; ky < kSizeY; ky++)*/
+
+          /************************* 3rd inCh *******************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads 2nd input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                               2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* load 1 row of coeff for 1st output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            /* load 1 row of coeff for 2nd output channel */
+            IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* multiples loaded input data with first four coeff */
+            MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+
+            MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                              0));
+            MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                              (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                              0));
+          } /* for (ky = 0; ky < kSizeY; ky++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                                enable2ndRow * outDataPitch1) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+}
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_MxNj1d1I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for MxN 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate MxN 3D dilated convolution function and MxN */
+/*               3D VQ dilated convolution function for U8 bit and S8 bit     */
+/*               input data with input stride equal to 1                      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XAI Error Code                                               */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is MxNxDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_MxNj1d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_MxNj1d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_MxNj1d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_MxNj1d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(coeffTile) <= 16, XAI_ERR_KSIZE,           \
+                    "Kernel width = %d, which should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM1(coeffTile));
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(coeffTile) <= 16, XAI_ERR_KSIZE,              \
+                    "\nKernel height = %d, which should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM2(coeffTile));
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); /* report the dilation values being compared, not the strides */
+    XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  if (XAI_TILE3D_GET_DIM3(inTile) == 3)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_MxNj1d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Pitches of Coefficient Data (WHDN) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  uint8_t leftEdge, topEdge;
+  if ((kSizeX % 2) != 0)
+  {
+    leftEdge = kSizeX / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kSizeX / 2) : ((kSizeX / 2) - 1);
+  }
+
+  if ((kSizeY % 2) != 0)
+  {
+    topEdge = kSizeY / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kSizeY / 2) : ((kSizeY / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn1;
+  MORPH_IDT_2Nx8 * restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH;
+  int32_t varLen;
+
+  if (kSizeX > 12)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* handles odd output row */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vec2Nx8 dvecCoeffData11;
+            xb_vec2Nx8 dvecCoeffData21;
+
+            /* dvecInData11 refers to 1st input row, first 64(or lesser) elements
+             * and dvecInData12 refers to next few left out elements of the same row
+             * required to compute one 64 way output vector(To compute one 64 way
+             * output vector, we require 64 + edge1 + edge2 number of input elements)
+             */
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+            xb_vec2Nx8 dvecInData21, dvecInData22;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+            pdvecIn2 = (MORPH_IDT_2Nx8 *)  (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              /* loads 2nd input row */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+              MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* multiples loaded input data with first four coeff */
+              MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data by the next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+
+              MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data by the next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                2));
+              MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                2));
+
+              MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                2));
+              MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                2));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data by the next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                3));
+              MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                3));
+
+              MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                3));
+              MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                3));
+            } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                                enable2ndRow * outDataPitch1) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kSizeX > 8)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* handles odd output row */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel*/
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vec2Nx8 dvecCoeffData11;
+            xb_vec2Nx8 dvecCoeffData21;
+
+            /* dvecInData11 refers to 1st input row, first 64(or lesser) elements
+             * and dvecInData12 refers to next few left out elements of the same row
+             * required to compute one 64 way output vector(To compute one 64 way
+             * output vector, we require 64 + edge1 + edge2 number of input elements)
+             */
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+            xb_vec2Nx8 dvecInData21, dvecInData22;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+            pdvecIn2 = (MORPH_IDT_2Nx8 *)  (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              /* loads 2nd input row */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+              MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* multiplies the loaded input data by the first four coeffs */
+              MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data by the next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+
+              MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data by the next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                2));
+              MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                2));
+
+              MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                2));
+              MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                2));
+            } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                                enable2ndRow * outDataPitch1) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kSizeX > 4)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* handles odd output row */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vec2Nx8 dvecCoeffData11;
+            xb_vec2Nx8 dvecCoeffData21;
+
+            /* dvecInData11 refers to 1st input row, first 64(or lesser) elements
+             * and dvecInData12 refers to next few left out elements of the same row
+             * required to compute one 64 way output vector(To compute one 64 way
+             * output vector, we require 64 + edge1 + edge2 number of input elements)
+             */
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+            xb_vec2Nx8 dvecInData21, dvecInData22;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+            pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              /* loads 2nd input row */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+              MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* multiplies the loaded input data by the first four coeffs */
+              MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data by the next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+
+              MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+            } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh ], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                                enable2ndRow * outDataPitch1) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* handles odd output row */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vec2Nx8 dvecCoeffData11;
+            xb_vec2Nx8 dvecCoeffData21;
+
+            /* dvecInData11 refers to 1st input row, first 64(or lesser) elements
+             * and dvecInData12 refers to next few left out elements of the same row
+             * required to compute one 64 way output vector(To compute one 64 way
+             * output vector, we require 64 + edge1 + edge2 number of input elements)
+             */
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+            xb_vec2Nx8 dvecInData21, dvecInData22;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+            pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              /* loads 2nd input row */
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+              MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* multiples loaded input data with first four coeff */
+              MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+            } /* for (ky = 0; ky < kSizeY; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                                enable2ndRow * outDataPitch1) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+* MxN MOW WHD Stride 2 - DEPTH 3                                                          *
+* This function is called when the number of input channels is equal to 3.                *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_MxNj2d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Pitches of Coefficient Data (WHDN) dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  uint8_t leftEdge, topEdge;
+  if ((kSizeX % 2) != 0)
+  {
+    leftEdge = kSizeX / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kSizeX / 2) : ((kSizeX / 2) - 1);
+  }
+
+  if ((kSizeY % 2) != 0)
+  {
+    topEdge = kSizeY / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kSizeY / 2) : ((kSizeY / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8* restrict pdvecIn1;
+  MORPH_IDT_2Nx8* restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky;
+  int32_t varLen;
+
+  /* Number of output elements that can be generated
+   * with 2 input vector loads(64 way).*/
+  const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeX) / stride) + 1;
+
+  if (kSizeX > 8)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)    /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* In order to handle odd output height  */
+        int32_t enable2Row = XT_SALT(y, outH - 1);
+
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+        /* initialize coeff and Bias data pointer */
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* In order to handle odd output depth  */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          xb_vec2Nx8 dvecCoeffData1;
+          xb_vec2Nx8 dvecCoeffData2;
+
+          xb_vec2Nx8 dvecInData11, dvecInData12;
+          xb_vec2Nx8 dvecInData21, dvecInData22;
+
+          /************************* 1st inCh *****************************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2Row);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \
+                          inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads Next(3rd) input row, corresponding to 2nd output row */
+
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \
+                          inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* Re-arrange the data in the desired format                                    */
+            /* Assume input as 1,2,3,4,5,6,7...127                                          */
+            /* After re-arrangement using DSEL operation, updated vectors would be */
+            /* dvecInData1 : 1,  3,  5,...121                                              */
+            /* dvecInData2 : 2,  4,  6,...122                                              */
+
+            IVP_DSEL2NX8I(dvecInData12, dvecInData11,
+                          dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecInData22, dvecInData21,
+                          dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            /* load 1st row of coeffs for both output channels */
+            IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+            IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* rearrange the coeff vectors. Separate even and odd coeff
+             * so that MUL4T can be used
+             */
+            IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1,
+                          dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+            /* multiply 1st input row with 1st 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+            /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+            /* right rotate the input vectors
+             * in order to multiply with next columns of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+
+            /* multiply 1st input row with next 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+            /* multiply 2nd input row with next 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+
+            /* multiply 1st input row with next 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+
+            /* multiply 2nd input row with next 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+          }   /* for (ky = 0; ky < kSizeY; ky++)*/
+
+          /************************* 2nd inCh *****************************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + \
+                                         stride * inDataPitch1 * enable2Row);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \
+                          inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads Next(3rd) input row, corresponding to 2nd output row */
+
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \
+                          inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* Arrange input vectors required for Quad multiply*/
+            IVP_DSEL2NX8I(dvecInData12, dvecInData11,
+                          dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecInData22, dvecInData21,
+                          dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            /* load 1st row of coeffs for both output channels */
+            IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+            IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* rearrange the coeff vectors. Separate even and odd coeff
+             * so that MUL4T can be used
+             */
+            IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1,
+                          dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+            /* multiply 1st input row with 1st 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+            /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+            /* right rotate the input vectors
+             * in order to multiply with next columns of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+
+            /* multiply 1st input row with next 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+            /* multiply 2nd input row with next 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+
+            /* multiply 1st input row with next 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+
+            /* multiply 2nd input row with next 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+          }   /* for (ky = 0; ky < kSizeY; ky++)*/
+
+          /************************* 3rd inCh *****************************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + \
+                                         stride * inDataPitch1 * enable2Row);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \
+                          inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads Next(3rd) input row, corresponding to 2nd output row */
+
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \
+                          inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* Arrange input vectors required for Quad multiply*/
+            IVP_DSEL2NX8I(dvecInData12, dvecInData11,
+                          dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecInData22, dvecInData21,
+                          dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            /* load 1st row of coeffs for both output channels */
+            IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+            IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* rearrange the coeff vectors. Separate even and odd coeff
+             * so that MUL4T can be used
+             */
+            IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1,
+                          dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+            /* multiply 1st input row with 1st 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+            /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+            /* right rotate the input vectors
+             * in order to multiply with next columns of
+             * coeff in the next iteration
+             */
+            dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+            dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+
+            /* multiply 1st input row with next 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+            /* multiply 2nd input row with next 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+
+            /* multiply 1st input row with next 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+
+            /* multiply 2nd input row with next 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+          }   /* for (ky = 0; ky < kSizeY; ky++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut11L, dvecOut12L, dvecOut21L, dvecOut22L;
+          xb_vec2Nx8 dvecOut11H, dvecOut12H, dvecOut21H, dvecOut22H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable length for output stores */
+          varLen = XT_MIN(vectorizationWidth, outW - x);
+
+          /* Storing the first output channel, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut11L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut11H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the first output channel, second row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut12L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2Row);
+          IVP_SAV2NX8_XP(dvecOut12H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2Row);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd output channel, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut21L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut21H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd output channel, 2nd row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                                outDataPitch1 * enable2Row) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut22L, vaOutData, pdvecOut, bytesPerPixel * \
+                         varLen * enable2ndCh * enable2Row);
+          IVP_SAV2NX8_XP(dvecOut22H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh * enable2Row);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)    /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* In order to handle odd output height */
+        int32_t enable2Row = XT_SALT(y, outH - 1);
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+        /* initialize coeff and Bias data pointer */
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* In order to handle odd output depth */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          xb_vec2Nx8 dvecCoeffData1;
+          xb_vec2Nx8 dvecCoeffData2;
+
+          xb_vec2Nx8 dvecInData11, dvecInData12;
+          xb_vec2Nx8 dvecInData21, dvecInData22;
+          /****************************** 1st inCh ******************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2Row);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \
+                          inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads Next(3rd) input row, corresponding to 2nd output row */
+
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \
+                          inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* Re-arrange the data in the desired format                                    */
+            /* Assume input as 1,2,3,4,5,6,7...127                                          */
+            /* After re-arrangement using DSEL operation, updated vectors would be */
+            /* dvecInData1 : 1,  3,  5,...121                                              */
+            /* dvecInData2 : 2,  4,  6,...122                                              */
+
+            IVP_DSEL2NX8I(dvecInData12, dvecInData11,
+                          dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecInData22, dvecInData21,
+                          dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            /* load 1st row of coeffs for both output channels */
+            IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+            IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* rearrange the coeff vectors. Separate even and odd coeff
+             * so that MUL4T can be used
+             */
+            IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1,
+                          dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+            /* multiply 1st input row with 1st 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+            /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+          }   /* for (ky = 0; ky < kSizeY; ky++)*/
+
+          /****************************** 2nd inCh ******************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + \
+                                         stride * inDataPitch1 * enable2Row);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \
+                          inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads Next(3rd) input row, corresponding to 2nd output row */
+
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \
+                          inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* Arrange input vectors required for Quad multiply*/
+            IVP_DSEL2NX8I(dvecInData12, dvecInData11,
+                          dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecInData22, dvecInData21,
+                          dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            /* load 1st row of coeffs for both output channels */
+            IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+            IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* rearrange the coeff vectors. Separate even and odd coeff
+             * so that MUL4T can be used
+             */
+            IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1,
+                          dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+            /* multiply 1st input row with 1st 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+            /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+          }   /* for (ky = 0; ky < kSizeY; ky++)*/
+
+          /****************************** 3rd inCh ******************************/
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + \
+                                         stride * inDataPitch1 * enable2Row);
+
+          for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \
+                          inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* loads Next(3rd) input row, corresponding to 2nd output row */
+
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+            IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \
+                          inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+            /* Arrange input vectors required for Quad multiply*/
+            IVP_DSEL2NX8I(dvecInData12, dvecInData11,
+                          dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecInData22, dvecInData21,
+                          dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            /* load 1st row of coeffs for both output channels */
+            IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+            IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            /* rearrange the coeff vectors. Separate even and odd coeff
+             * so that MUL4T can be used
+             */
+            IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1,
+                          dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+            /* multiply 1st input row with 1st 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+            MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+            /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+            MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+          }   /* for (ky = 0; ky < kSizeY; ky++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut11L, dvecOut12L, dvecOut21L, dvecOut22L;
+          xb_vec2Nx8 dvecOut11H, dvecOut12H, dvecOut21H, dvecOut22H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable length for output stores */
+          varLen = XT_MIN(vectorizationWidth, outW - x);
+
+          /* Storing the first output channel, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut11L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut11H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the first output channel, second row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut12L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2Row);
+          IVP_SAV2NX8_XP(dvecOut12H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2Row);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd output channel, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut21L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut21H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd output channel, 2nd row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                                outDataPitch1 * enable2Row) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut22L, vaOutData, pdvecOut, bytesPerPixel * \
+                         varLen * enable2ndCh * enable2Row);
+          IVP_SAV2NX8_XP(dvecOut22H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh * enable2Row);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+}
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_MxNj2d1_I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for MxN 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate MxN 3D dilated convolution function and MxN */
+/*               3D VQ dilated convolution function for U8 and S8 input data  */
+/*               with stride equal to 2 and dilation equal to 1               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XAI error code (XAI_ERR_TYPE)                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is MxNxDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_MxNj2d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_MxNj2d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_MxNj2d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_MxNj2d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj2d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(coeffTile) <= 16, XAI_ERR_KSIZE,           \
+                    "Kernel width = %d, which should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM1(coeffTile));
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(coeffTile) <= 16, XAI_ERR_KSIZE,              \
+                    "\nKernel height = %d, which should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM2(coeffTile));
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_STRIDE(param, 2);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  if (XAI_TILE3D_GET_DIM3(inTile) == 3)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_MxNj2d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Pitches of Coefficient Data (WHDN) dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  uint8_t leftEdge, topEdge;
+  if ((kSizeX % 2) != 0)
+  {
+    leftEdge = kSizeX / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kSizeX / 2) : ((kSizeX / 2) - 1);
+  }
+
+  if ((kSizeY % 2) != 0)
+  {
+    topEdge = kSizeY / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kSizeY / 2) : ((kSizeY / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8* restrict pdvecIn1;
+  MORPH_IDT_2Nx8* restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  int32_t varLen;
+
+  /* Number of output elements that can be generated
+   * with 2 input vector loads(64 way).*/
+  const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeX) / stride) + 1;
+
+  if (kSizeX > 8)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)    /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* In order to handle odd output height  */
+        int32_t enable2Row = XT_SALT(y, outH - 1);
+
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+        /* initialize coeff and Bias data pointer */
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* In order to handle odd output depth  */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            xb_vec2Nx8 dvecCoeffData1;
+            xb_vec2Nx8 dvecCoeffData2;
+
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+            xb_vec2Nx8 dvecInData21, dvecInData22;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+            pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + \
+                                           stride * inDataPitch1 * enable2Row);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \
+                            inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              /* loads Next(3rd) input row, corresponding to 2nd output row */
+
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+              IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \
+                            inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              /* Re-arrange the data in the desired format                                    */
+              /* Assume input as 1,2,3,4,5,6,7...127                                          */
+              /* After re-arrangement using DSEL operation, updated vectors would be */
+              /* dvecInData1 : 1,  3,  5,...121                                              */
+              /* dvecInData2 : 2,  4,  6,...122                                              */
+
+              IVP_DSEL2NX8I(dvecInData12, dvecInData11,
+                            dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1);
+              IVP_DSEL2NX8I(dvecInData22, dvecInData21,
+                            dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+              /* load 1st row of coeffs for both output channels */
+              IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+              IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* rearrange the coeff vectors. Separate even and odd coeff
+               * so that MUL4T can be used
+               */
+              IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1,
+                            dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+              /* multiply 1st input row with 1st 8 coeff from 1st output channel*/
+              MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+              MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+              /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/
+              MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+              MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+              /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/
+              MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+              MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+              /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/
+              MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+              MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+              /* right rotate the input vectors
+               * in order to multiply with next columns of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+
+              /* multiply 1st input row with next 8 coeff from 1st output channel*/
+              MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+              MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+              /* multiply 2nd input row with next 8 coeff from 1st output channel*/
+              MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+              MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+
+              /* multiply 1st input row with next 8 coeff from 2nd output channel*/
+              MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+              MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+
+              /* multiply 2nd input row with next 8 coeff from 2nd output channel*/
+              MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+              MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+            }   /* for (ky = 0; ky < kSizeY; ky++)*/
+          }     /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut11L, dvecOut12L, dvecOut21L, dvecOut22L;
+          xb_vec2Nx8 dvecOut11H, dvecOut12H, dvecOut21H, dvecOut22H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable length for output stores */
+          varLen = XT_MIN(vectorizationWidth, outW - x);
+
+          /* Storing the first output channel, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut11L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut11H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the first output channel, second row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut12L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2Row);
+          IVP_SAV2NX8_XP(dvecOut12H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2Row);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd output channel, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut21L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut21H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd output channel, 2nd row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                                outDataPitch1 * enable2Row) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut22L, vaOutData, pdvecOut, bytesPerPixel * \
+                         varLen * enable2ndCh * enable2Row);
+          IVP_SAV2NX8_XP(dvecOut22H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh * enable2Row);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)    /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* In order to handle odd output height */
+        int32_t enable2Row = XT_SALT(y, outH - 1);
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+        /* initialize coeff and Bias data pointer */
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* In order to handle odd output depth */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            xb_vec2Nx8 dvecCoeffData1;
+            xb_vec2Nx8 dvecCoeffData2;
+
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+            xb_vec2Nx8 dvecInData21, dvecInData22;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+            pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + \
+                                           stride * inDataPitch1 * enable2Row);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \
+                            inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              /* loads Next(3rd) input row, corresponding to 2nd output row */
+
+              vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+              IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \
+                            inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              /* Re-arrange the data in the desired format                                    */
+              /* Assume input as 1,2,3,4,5,6,7...127                                          */
+              /* After re-arrangement using DSEL operation, updated vectors would be */
+              /* dvecInData1 : 1,  3,  5,...121                                              */
+              /* dvecInData2 : 2,  4,  6,...122                                              */
+
+              IVP_DSEL2NX8I(dvecInData12, dvecInData11,
+                            dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1);
+              IVP_DSEL2NX8I(dvecInData22, dvecInData21,
+                            dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+              /* load 1st row of coeffs for both output channels */
+              IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+              IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* rearrange the coeff vectors. Separate even and odd coeff
+               * so that MUL4T can be used
+               */
+              IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1,
+                            dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+              /* multiply 1st input row with 1st 8 coeff from 1st output channel*/
+              MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+              MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+              /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/
+              MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+              MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+              /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/
+              MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+              MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+
+              /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/
+              MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+              MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \
+                                IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+            }   /* for (ky = 0; ky < kSizeY; ky++)*/
+          }     /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut11L, dvecOut12L, dvecOut21L, dvecOut22L;
+          xb_vec2Nx8 dvecOut11H, dvecOut12H, dvecOut21H, dvecOut22H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* variable length for output stores */
+          varLen = XT_MIN(vectorizationWidth, outW - x);
+
+          /* Storing the first output channel, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut11L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut11H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the first output channel, second row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut12L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2Row);
+          IVP_SAV2NX8_XP(dvecOut12H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2Row);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd output channel, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut21L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut21H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the 2nd output channel, 2nd row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                                outDataPitch1 * enable2Row) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut22L, vaOutData, pdvecOut, bytesPerPixel * \
+                         varLen * enable2ndCh * enable2Row);
+          IVP_SAV2NX8_XP(dvecOut22H, vaOutData, pdvecOut, typeFlag * \
+                         2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh * enable2Row);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+* MxN MOW WHD Stride 4 - DEPTH 3                                                          *
+* If number of input channels is equal to 3                                               *
+* this function is called.                                                                *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_MxNj4d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Pitches of Coefficient Data (WHDN) dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  uint8_t leftEdge, topEdge;
+  if ((kSizeX % 2) != 0)
+  {
+    leftEdge = kSizeX / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kSizeX / 2) : ((kSizeX / 2) - 1);
+  }
+
+  if ((kSizeY % 2) != 0)
+  {
+    topEdge = kSizeY / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kSizeY / 2) : ((kSizeY / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2, \
+  * restrict pdvecCoeff3, * restrict pdvecCoeff4;
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky;
+  int32_t varLen;
+
+  /* Number of output elements that can be generated
+   * with 2 input vector loads(64 way).*/
+  const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeX) / stride) + 1;
+
+  /* Variable count to handle the last iteration
+   * of X loop separately if only 1 i/p load is
+   * sufficient
+   */
+  const int32_t remX = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeX) / stride) + 1;
+
+  /* generates the shuffle sequence for the coeff, so that MUL4T can be used.
+   * Rearranges coeff from c0,c1,..c13,c14 in the following manner:
+   *
+   * c0,c4,c8,c12
+   * c1,c5,c9,c13
+   * c2,c6,c10,c14
+   * c3,c7,c11,0
+   * */
+  xb_vec2Nx8 dvecIdx = IVP_SEQ2NX8();
+  xb_vec2Nx8 dvec1, dvec2;
+  IVP_DSEL2NX8I(dvec2, dvec1, 0, dvecIdx, IVP_DSELI_8B_DEINTERLEAVE_1);
+  IVP_DSEL2NX8I(dvecIdx, dvec1, 0, dvec1, IVP_DSELI_8B_DEINTERLEAVE_1);
+  dvec1 = IVP_SEL2NX8I(dvecIdx, dvec1, IVP_SELI_8B_INTERLEAVE_4_LO);
+  IVP_DSEL2NX8I(dvecIdx, dvec2, 0, dvec2, IVP_DSELI_8B_DEINTERLEAVE_1);
+  dvec2   = IVP_SEL2NX8I(dvecIdx, dvec2, IVP_SELI_8B_INTERLEAVE_4_LO);
+  dvecIdx = IVP_SEL2NX8I(dvec2, dvec1, IVP_SELI_8B_INTERLEAVE_4_LO);
+
+  /* loop across output depth is unrolled by 4
+   * , producing lanes from 4 output channels
+   * in one iteration. Since vectorization width
+   * is just half the width of the accumulator,
+   * loop across output height is also unrolled by 2.
+   * Unrolling across output height makes it possible
+   * to utilize all the 64 MACs in the accumulator.
+   *
+   * Data loaded from the 2 input rows is concatenated
+   * in such a manner that lower half of the output
+   * vector gives the first output row and the upper
+   * half of the output vector gives the second output row. */
+  for (x = 0; x < outW - remX; x += vectorizationWidth)   /* Loop across Output width */
+  {
+    for (y = 0; y < outH; y += 2)     /* Loop across Output height */
+    {
+      /* In order to handle odd height*/
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x];
+
+      /* initialize coeff and Bias data pointers */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4)    /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* loads and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+
+        pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh);
+        valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4);
+
+        /* variable declarations for input and coeff vectors */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4;
+
+        MORPH_IDT_2Nx8 dvecInData11, dvecInData12;
+        MORPH_IDT_2Nx8 dvecInData21, dvecInData22;
+
+        MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+        /************************** 1st inCh **************************/
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2ndRow);
+
+        for (ky = 0; ky < kSizeY; ky++)    /* Loop across kernel height */
+        {
+          /* loads 1st input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8_IP(dvecInData11, vaInData, pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                             2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* loads 5th(corresponding to the 2nd output row) input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8_IP(dvecInData21, vaInData, pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                             2 * XCHAL_IVPN_SIMD_WIDTH);
+
+
+          /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+           * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is
+           * 128,129,130,131.........252,253,254,255, Data should be arranged  as
+           *
+           * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252
+           * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253
+           * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254
+           * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255
+           *
+           * Lower half of the vectors contain data from 1st input row and
+           * upper half of the vectors contain data from 2nd output row.
+           *
+           */
+
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load 1 row of coeff for all the 4 output channels */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1);
+
+          /* shuffles the coeff in desired manner */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+          dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+          dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx);
+
+          /* multiplies coeff c0,c4,c8,c12 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+          /* multiplies coeff c1,c5,c9,c13 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+          /* multiplies coeff c2,c6,c10,c14 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+
+          /* multiplies coeff c3,c7,c11,0 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3));
+        }  /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+        /************************** 2nd inCh **************************/
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + \
+                                       stride * inDataPitch1 * enable2ndRow);
+
+        for (ky = 0; ky < kSizeY; ky++)    /* Loop across kernel height */
+        {
+          /* loads 1st input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8_IP(dvecInData11, vaInData, pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                             2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* loads 5th(corresponding to the 2nd output row) input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8_IP(dvecInData21, vaInData, pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                             2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* Arrange input vectors required for Quad multiply*/
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load 1 row of coeff for all the 4 output channels */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1);
+
+          /* shuffles the coeff in desired manner */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+          dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+          dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx);
+
+          /* multiplies coeff c0,c4,c8,c12 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+          /* multiplies coeff c1,c5,c9,c13 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+          /* multiplies coeff c2,c6,c10,c14 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+
+          /* multiplies coeff c3,c7,c11,0 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3));
+        }  /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+        /************************** 3rd inCh **************************/
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + \
+                                       stride * inDataPitch1 * enable2ndRow);
+
+        for (ky = 0; ky < kSizeY; ky++)    /* Loop across kernel height */
+        {
+          /* loads 1st input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8_IP(dvecInData11, vaInData, pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \
+                             2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* loads 5th(corresponding to the 2nd output row) input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8_IP(dvecInData21, vaInData, pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \
+                             2 * XCHAL_IVPN_SIMD_WIDTH);
+
+          /* Arrange input vectors required for Quad multiply*/
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load 1 row of coeff for all the 4 output channels */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1);
+
+          /* shuffles the coeff in desired manner */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+          dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+          dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx);
+
+          /* multiplies coeff c0,c4,c8,c12 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+          /* multiplies coeff c1,c5,c9,c13 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+          /* multiplies coeff c2,c6,c10,c14 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+
+          /* multiplies coeff c3,c7,c11,0 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3));
+        }  /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable store count */
+        varLen = XT_MIN(outW - x, vectorizationWidth);
+
+        /* store the first half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         */
+
+        /* Storing the first row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* extract the half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         * and store in the next row
+         */
+
+        /* Storing the 2nd row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable2ndCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable3rdCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable4thCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      }  /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }    /* end of for (y = 0; y < outH; y += 2)*/
+  }      /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  if (x < outW)
+  {
+    for (y = 0; y < outH; y += 2)     /* Loop across Output height */
+    {
+      /* In order to handle odd height*/
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x];
+
+      /* initialize coeff and Bias data pointers */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4)    /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* loads and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+
+        pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh);
+        valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4);
+
+        /* variable declarations for input and coeff vectors */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4;
+
+        MORPH_IDT_2Nx8 dvecInData11;
+        MORPH_IDT_2Nx8 dvecInData21;
+
+        MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+        /**************************** 1st inCh ***************************/
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + \
+                                       stride * inDataPitch1 * enable2ndRow);
+
+        for (ky = 0; ky < kSizeY; ky++)    /* Loop across kernel height */
+        {
+          /* loads 1st input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, inDataPitch1);
+
+          /* loads 5th(corresponding to the 2nd output row) input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, inDataPitch1);
+
+
+          /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+           * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is
+           * 128,129,130,131.........252,253,254,255, Data should be arranged  as
+           *
+           * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252
+           * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253
+           * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254
+           * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255
+           *
+           * Lower half of the vectors contain data from 1st input row and
+           * upper half of the vectors contain data from 2nd output row.
+           *
+           */
+
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load 1 row of coeff for all the 4 output channels */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1);
+
+          /* shuffles the coeff in desired manner */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+          dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+          dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx);
+
+          /* multiplies coeff c0,c4,c8,c12 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+          /* mulitples coeff c1,c5,c9,c13 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+          /* mulitples coeff c2,c6,c10,c14 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+
+          /* mulitples coeff c3,c7,c11,0 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3));
+        }  /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+        /**************************** 2nd inCh ***************************/
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + \
+                                       stride * inDataPitch1 * enable2ndRow);
+
+        for (ky = 0; ky < kSizeY; ky++)    /* Loop across kernel height */
+        {
+          /* loads 1st input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, inDataPitch1);
+
+          /* loads 5th(corresponding to the 2nd output row) input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, inDataPitch1);
+
+          /* Arrange input vectors required for Quad multiply*/
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load 1 row of coeff for all the the 4 output channels */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1);
+
+          /* shuffles the coeff in desired manner */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+          dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+          dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx);
+
+          /* mulitples coeff c0,c4,c8,c12 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+          /* mulitples coeff c1,c5,c9,c13 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+          /* mulitples coeff c2,c6,c10,c14 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+
+          /* mulitples coeff c3,c7,c11,0 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3));
+        }  /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+        /**************************** 3rd inCh ***************************/
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + \
+                                       stride * inDataPitch1 * enable2ndRow);
+
+        for (ky = 0; ky < kSizeY; ky++)    /* Loop across kernel height */
+        {
+          /* loads 1st input row */
+          valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+          MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, inDataPitch1);
+
+          /* loads 5th(corresponding to the 2nd output row) input row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+          MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, inDataPitch1);
+
+          /* Arrange input vectors required for Quad multiply*/
+          IVP_DSEL2NX8I(dvecData2, dvecData1,
+                        IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+          IVP_DSEL2NX8I(dvecData4, dvecData3,
+                        IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                        IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* load 1 row of coeff for all the the 4 output channels */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1);
+          IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1);
+
+          /* shuffles the coeff in desired manner */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+          dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+          dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx);
+
+          /* mulitples coeff c0,c4,c8,c12 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+          /* mulitples coeff c1,c5,c9,c13 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+          /* mulitples coeff c2,c6,c10,c14 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+
+          /* mulitples coeff c3,c7,c11,0 with input data */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+          MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \
+                            IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3));
+        }  /* end of for (ky = 0; ky < kSizeY; ky++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable store count */
+        varLen = XT_MIN(outW - x, vectorizationWidth);
+
+        /* store the first half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         */
+
+        /* Storing the first row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* extract the half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         * and store in the next row
+         */
+
+        /* Storing the 2nd row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable2ndCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable3rdCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable4thCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      }  /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }    /* end of for (y = 0; y < outH; y += 2)*/
+  }
+}
+
+/******************************************************************************************
+   convolved3D_S_11x11j4d1_S8S8IX_MOW_WHD
+   convolved3D_S_11x11j4d1_U8S8IX_MOW_WHD
+   convolvedVQ3D_S_11x11j4d1_S8S8IX_MOW_WHD
+   convolvedVQ3D_S_11x11j4d1_U8S8IX_MOW_WHD
+* 11x11 MOW WHD Stride 4, dilation 1                                                      *
+******************************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_11x11j4d1), S8IX_MOW_WHD) \
+  MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Pitches of Coefficient Data (WHDN) dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, *restrict pdvecCoeff2, \
+  * restrict pdvecCoeff3, *restrict pdvecCoeff4;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  int32_t varLen;
+
+  /* variable declarations for input and coeff vectors */
+  MORPH_IDT_2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3;
+  MORPH_IDT_2Nx8 dvecCoeffData4, dvecInData11, dvecInData12;
+  MORPH_IDT_2Nx8 dvecInData21, dvecInData22;
+  MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+  MORPH_IDT_2Nx8 dvecData5, dvecData6, dvecData7, dvecData8;
+  MORPH_IDT_2Nx8 dvecData9, dvecData10, dvecData11;
+
+  /* Number of output elements that can be generated
+   * with 2 input vector loads(64 way).*/
+  const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1;
+
+  /* Variable count to handle the last iteration
+   * of the X loop separately if only 1 i/p load is
+   * sufficient
+   */
+  const int32_t remX = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1;
+
+  /* loop across output depth is unrolled by 4
+   * , producing lanes from 4 output channels
+   * in one iteration. Since vectorization width
+   * is just half the width of the accumulator,
+   * loop across output height is also unrolled by 2.
+   * Unrolling across output height makes it possible
+   * to utilize all the 64 MACs in the accumulator.
+   *
+   * Data loaded from the 2 input rows is concatenated
+   * in such a manner that lower half of the output
+   * vector gives the first output row and the upper
+   * half gives the second output row. */
+  for (x = 0; x < outW - remX; x += vectorizationWidth)   /* Loop across Output width */
+  {
+    for (y = 0; y < outH; y += 2)     /* Loop across Output height */
+    {
+      /* In order to handle odd height*/
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x];
+
+      /* initialize coeff and Bias data pointer to */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4)    /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* loads and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh);
+
+        for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+        {
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + \
+                                         stride * inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < 11; ky++)    /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8_IP(dvecInData11, vaInData, pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1,
+                               inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+            /* loads 5th(corresponding to the 2nd output row) input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8_IP(dvecInData21, vaInData, pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2,
+                               inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+            /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+             * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is
+             * 128,129,130,131.........252,253,254,255, Data should be arranged  as
+             *
+             * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252
+             * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253
+             * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254
+             * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255
+             *
+             * Lower half of the vectors contain data from 1st input row and
+             * upper half of the vectors contain data from 2nd output row.
+             *
+             */
+            IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecData3, dvecData1, dvecInData21, dvecInData11, \
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecData4, dvecData2, dvecInData22, dvecInData12, \
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            dvecData5 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+            dvecData6 = IVP_SEL2NX8I(0, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+            dvecData7 = IVP_SEL2NX8I(0, dvecData3, IVP_SELI_8B_ROTATE_RIGHT_1);
+            dvecData8 = IVP_SEL2NX8I(0, dvecData4, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+            dvecData9  = IVP_SEL2NX8I(0, dvecData5, IVP_SELI_8B_ROTATE_RIGHT_1);
+            dvecData10 = IVP_SEL2NX8I(0, dvecData6, IVP_SELI_8B_ROTATE_RIGHT_1);
+            dvecData11 = IVP_SEL2NX8I(0, dvecData7, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+            /* load 1 row of coeff for all the the 4 output channels */
+            valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+            IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+            IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+            IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1);
+
+            valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4);
+            IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1);
+
+            /* multiplies data from two rows(Lower and upper half of dvecData)
+             * with coeff from 1st output channel and accumulate */
+
+            /* IVP_EXTRVRN_2X32 extracts the required coeff from the
+             * coeff vector. In every iteration ky is updated therefore it
+             * extracts coeff from the next coeff row in the successive ky
+             * iterations. "ky * coeffPitch1 + 4" extracts coeffs next to
+             * first four coeff in a row
+             */
+            MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+            MORPH_OP_MULQA(dacc1, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+            MORPH_OP_MULQA(dacc1, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+            /* multiplies data from two rows(Lower and upper half of dvecData)
+             * with coeff from 2nd output channel and accumulate */
+            MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            MORPH_OP_MULQA(dacc2, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+            MORPH_OP_MULQA(dacc2, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+            /* multiplies data from two rows(Lower and upper half of dvecData)
+             * with coeff from 3rd output channel and accumulate */
+            MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+
+            MORPH_OP_MULQA(dacc3, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+
+            MORPH_OP_MULQA(dacc3, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+
+            /* multiplies data from two rows(Lower and upper half of dvecData)
+             * with coeff from 4th output channel and accumulate */
+            MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+            MORPH_OP_MULQA(dacc4, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+            MORPH_OP_MULQA(dacc4, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+          }  /* end of for (ky = 0; ky < kSizeY; ky++)*/
+        }    /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable store count */
+        varLen = XT_MIN(outW - x, vectorizationWidth);
+
+        /* store the first half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         */
+
+        /* Storing the first row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* extract the half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         * and store in the next row
+         */
+
+        /* Storing the 2nd row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable2ndCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable3rdCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable4thCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      }  /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }    /* end of for (y = 0; y < outH; y += 2)*/
+  }      /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+
+  if (x < outW)
+  {
+    for (y = 0; y < outH; y += 2)     /* Loop across Output height */
+    {
+      /* In order to handle odd height*/
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x];
+
+      /* initialize coeff and bias data pointers */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4)    /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* loads and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh);
+
+        for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+        {
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + \
+                                         stride * inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < 11; ky++)    /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, inDataPitch1);
+
+            /* loads 5th(corresponding to the 2nd output row) input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, inDataPitch1);
+
+
+            /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+             * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...63, and the 2nd input row is
+             * 64, 65, 66, 67.........124,125,126,127, Data should be arranged  as
+             *
+             * dvecData1 : 0, 4, 8,...120,124
+             * dvecData2 : 1, 5, 9,...121,125
+             * dvecData3 : 2, 6,10,...122,126
+             * dvecData4 : 3, 7,11,...123,127
+             *
+             * Lower half of the vectors contain data from 1st input row and
+             * upper half of the vectors contain data from 2nd input row.
+             *
+             */
+            IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData11, dvecInData11, \
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData21, dvecInData21, \
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            IVP_DSEL2NX8I(dvecData3, dvecData1, dvecInData21, dvecInData11, \
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            IVP_DSEL2NX8I(dvecData4, dvecData2, dvecInData22, dvecInData12, \
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            dvecData5 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1);
+            dvecData6 = IVP_SEL2NX8I(0, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1);
+            dvecData7 = IVP_SEL2NX8I(0, dvecData3, IVP_SELI_8B_ROTATE_RIGHT_1);
+            dvecData8 = IVP_SEL2NX8I(0, dvecData4, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+            dvecData9  = IVP_SEL2NX8I(0, dvecData5, IVP_SELI_8B_ROTATE_RIGHT_1);
+            dvecData10 = IVP_SEL2NX8I(0, dvecData6, IVP_SELI_8B_ROTATE_RIGHT_1);
+            dvecData11 = IVP_SEL2NX8I(0, dvecData7, IVP_SELI_8B_ROTATE_RIGHT_1);
+
+            /* load 1 row of coeff for all the 4 output channels */
+            valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+            IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+            valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+            IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+            valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+            IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1);
+
+            valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4);
+            IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1);
+
+            /* multiplies data from two rows(Lower and upper half of dvecData)
+             * with coeff from 1st output channel and accumulate */
+
+            /* IVP_EXTRN_2X32 extracts one 32-bit lane (four packed 8-bit
+             * coeffs) from the coeff vector. The coeff load above advances
+             * by coeffPitch1 on every ky iteration, so successive ky
+             * iterations use the next coeff row. Lane indices 0, 1 and 2
+             * pick the 1st, 2nd and 3rd group of four coeffs in that row.
+             */
+            MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+
+            MORPH_OP_MULQA(dacc1, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+            MORPH_OP_MULQA(dacc1, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+            /* multiplies data from two rows(Lower and upper half of dvecData)
+             * with coeff from 2nd output channel and accumulate */
+            MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+
+            MORPH_OP_MULQA(dacc2, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+            MORPH_OP_MULQA(dacc2, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+            /* multiplies data from two rows(Lower and upper half of dvecData)
+             * with coeff from 3rd output channel and accumulate */
+            MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+
+            MORPH_OP_MULQA(dacc3, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+
+            MORPH_OP_MULQA(dacc3, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+
+            /* multiplies data from two rows(Lower and upper half of dvecData)
+             * with coeff from 4th output channel and accumulate */
+            MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+            MORPH_OP_MULQA(dacc4, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+            MORPH_OP_MULQA(dacc4, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \
+                             (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+          }  /* end of for (ky = 0; ky < kSizeY; ky++)*/
+        }      /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable store count */
+        varLen = XT_MIN(outW - x, vectorizationWidth);
+
+        /* store the first half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         */
+
+        /* Storing the first row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* extract the half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         * and store in the next row
+         */
+
+        /* Storing the 2nd row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable2ndCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable3rdCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable4thCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      }  /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }    /* end of for (y = 0; y < outH; y += 2)*/
+  }
+}
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_MxNj4d1I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for MxN 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate MxN 3D dilated convolution function and MxN */
+/*               3D VQ dilated convolution function for U8 bit and S8 bit     */
+/*               input data with input stride equal to 4                      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is MxNxDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_MxNj4d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_MxNj4d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_MxNj4d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_MxNj4d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj4d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(coeffTile) <= 16, XAI_ERR_KSIZE,           \
+                    "kernel width = %d, which should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM1(coeffTile));
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(coeffTile) <= 16, XAI_ERR_KSIZE,               \
+                    "\nkernel height = %d,  which should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM2(coeffTile));
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                         \
+                    XAI_ERR_BADARG, "\nDilation along width = %u and height = %u\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_STRIDE(param, 4);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                       \
+                    XAI_ERR_BADARG, "\nStride along width = %u and height = %u\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                   \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %u, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                             \
+                    XAI_ERR_NORM, "\nThe output shift = %u, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  if (XAI_TILE4D_GET_DIM1(coeffTile) == 11 && XAI_TILE4D_GET_DIM2(coeffTile) == 11 &&
+      XAI_CNN_CONV_GET_STRIDEX(param) == 4)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_11x11j4d1), S8IX_MOW_WHD) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  if (XAI_TILE3D_GET_DIM3(inTile) == 3)
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_MxNj4d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Pitches of Coefficient Data (WHDN) dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  uint8_t leftEdge, topEdge;
+  if ((kSizeX % 2) != 0)
+  {
+    leftEdge = kSizeX / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kSizeX / 2) : ((kSizeX / 2) - 1);
+  }
+
+  if ((kSizeY % 2) != 0)
+  {
+    topEdge = kSizeY / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kSizeY / 2) : ((kSizeY / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2, \
+  * restrict pdvecCoeff3, * restrict pdvecCoeff4;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  int32_t varLen;
+
+  /* Number of output elements that can be generated
+   * with 2 input vector loads(64 way).*/
+  const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeX) / stride) + 1;
+
+  /* Variable count to handle the last iteration
+   * of the X loop separately if only 1 i/p load is
+   * sufficient
+   */
+  const int32_t remX = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeX) / stride) + 1;
+
+  /* generates the shuffle sequence for the coeff, so that MUL4T can be used.
+   * Rearranges coeff from c0,c1,..c13,c14 in the following manner:
+   *
+   * c0,c4,c8,c12
+   * c1,c5,c9,c13
+   * c2,c6,c10,c14
+   * c3,c7,c11,0
+   * */
+  xb_vec2Nx8 dvecIdx = IVP_SEQ2NX8();
+  xb_vec2Nx8 dvec1, dvec2;
+  IVP_DSEL2NX8I(dvec2, dvec1, 0, dvecIdx, IVP_DSELI_8B_DEINTERLEAVE_1);
+  IVP_DSEL2NX8I(dvecIdx, dvec1, 0, dvec1, IVP_DSELI_8B_DEINTERLEAVE_1);
+  dvec1 = IVP_SEL2NX8I(dvecIdx, dvec1, IVP_SELI_8B_INTERLEAVE_4_LO);
+  IVP_DSEL2NX8I(dvecIdx, dvec2, 0, dvec2, IVP_DSELI_8B_DEINTERLEAVE_1);
+  dvec2   = IVP_SEL2NX8I(dvecIdx, dvec2, IVP_SELI_8B_INTERLEAVE_4_LO);
+  dvecIdx = IVP_SEL2NX8I(dvec2, dvec1, IVP_SELI_8B_INTERLEAVE_4_LO);
+
+  /* loop across output depth is unrolled by 4
+   * , producing lanes from 4 output channels
+   * in one iteration. Since vectorization width
+   * is just half the width of the accumulator,
+   * loop across output height is also unrolled by 2.
+   * Unrolling across output height makes it possible
+   * to utilize all the 64 MACs in the accumulator.
+   *
+   * Data loaded from the 2 input rows is concatenated
+   * in such a manner that lower half of the output
+   * vector gives the first output row and the upper
+   * half of the output vector gives the second output row. */
+  for (x = 0; x < outW - remX; x += vectorizationWidth)   /* Loop across Output width */
+  {
+    for (y = 0; y < outH; y += 2)     /* Loop across Output height */
+    {
+      /* In order to handle odd height*/
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x];
+
+      /* initialize coeff and Bias data pointer to */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4)    /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* loads and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+
+        pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh);
+        valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4);
+
+        for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+        {
+          /* variable declarations for input and coeff vectors */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4;
+
+          MORPH_IDT_2Nx8 dvecInData11, dvecInData12;
+          MORPH_IDT_2Nx8 dvecInData21, dvecInData22;
+
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + \
+                                         stride * inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)    /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8_IP(dvecInData11, vaInData, pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+            /* loads 5th(corresponding to the 2nd output row) input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8_IP(dvecInData21, vaInData, pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+
+            /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+             * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is
+             * 128,129,130,131.........252,253,254,255, Data should be arranged  as
+             *
+             * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252
+             * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253
+             * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254
+             * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255
+             *
+             * Lower half of the vectors contain data from 1st input row and
+             * upper half of the vectors contain data from 2nd output row.
+             *
+             */
+
+            IVP_DSEL2NX8I(dvecData2, dvecData1,
+                          IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                          IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecData4, dvecData3,
+                          IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                          IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            /* load 1 row of coeff for all the the 4 output channels */
+            IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+            IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+            IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1);
+            IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1);
+
+            /* shuffles the coeff in desired manner */
+            dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+            dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+            dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+            dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx);
+
+            /* mulitples coeff c0,c4,c8,c12 with input data */
+            MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+            MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+            MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+            /* mulitples coeff c1,c5,c9,c13 with input data */
+            MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+            MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+            MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+            MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+            /* mulitples coeff c2,c6,c10,c14 with input data */
+            MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+            MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+            MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+            MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+
+            /* mulitples coeff c3,c7,c11,0 with input data */
+            MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+            MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+            MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+            MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3));
+          }  /* end of for (ky = 0; ky < kSizeY; ky++)*/
+        }      /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable store count */
+        varLen = XT_MIN(outW - x, vectorizationWidth);
+
+        /* store the first half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         */
+
+        /* Storing the first row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* extract the half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         * and store in the next row
+         */
+
+        /* Storing the 2nd row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable2ndCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable3rdCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable4thCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      }  /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }    /* end of for (y = 0; y < outH; y += 2)*/
+  }      /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  if (x < outW)
+  {
+    for (y = 0; y < outH; y += 2)     /* Loop across Output height */
+    {
+      /* In order to handle odd height*/
+      int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x];
+
+      /* initialize coeff and Bias data pointer to */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 4)    /* Loop across Output depth */
+      {
+        /* In order to handle odd depths*/
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+        int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2);
+        int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3);
+
+        /* loads and replicate bias data */
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh);
+        xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh);
+        xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4);
+
+        /* wide vectors(accumulators) initialized with bias */
+        xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+        dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3);
+        IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3);
+        dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4);
+        IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh);
+        valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3);
+
+        pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh);
+        valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4);
+
+        for (inCh = 0; inCh < numInCh; inCh++)    /* Loop across input channels */
+        {
+          /* variable declarations for input and coeff vectors */
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4;
+
+          MORPH_IDT_2Nx8 dvecInData11;
+          MORPH_IDT_2Nx8 dvecInData21;
+
+          MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4;
+
+          pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+          pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + \
+                                         stride * inDataPitch1 * enable2ndRow);
+
+          for (ky = 0; ky < kSizeY; ky++)    /* Loop across kernel height */
+          {
+            /* loads 1st input row */
+            valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+            MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, inDataPitch1);
+
+            /* loads 5th(corresponding to the 2nd output row) input row */
+            vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+            MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, inDataPitch1);
+
+
+            /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+             * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is
+             * 128,129,130,131.........252,253,254,255, Data should be arranged  as
+             *
+             * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252
+             * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253
+             * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254
+             * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255
+             *
+             * Lower half of the vectors contain data from 1st input row and
+             * upper half of the vectors contain data from 2nd output row.
+             *
+             */
+
+            IVP_DSEL2NX8I(dvecData2, dvecData1,
+                          IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                          IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0),
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+            IVP_DSEL2NX8I(dvecData4, dvecData3,
+                          IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                          IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2),
+                          IVP_DSELI_8B_DEINTERLEAVE_1);
+
+            /* load 1 row of coeff for all the the 4 output channels */
+            IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+            IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+            IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1);
+            IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1);
+
+            /* shuffles the coeff in desired manner */
+            dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx);
+            dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx);
+            dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx);
+            dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx);
+
+            /* mulitples coeff c0,c4,c8,c12 with input data */
+            MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+            MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+            MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0));
+            MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0));
+
+            /* mulitples coeff c1,c5,c9,c13 with input data */
+            MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+            MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+            MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1));
+            MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1));
+
+            /* mulitples coeff c2,c6,c10,c14 with input data */
+            MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+            MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+            MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2));
+            MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2));
+
+            /* mulitples coeff c3,c7,c11,0 with input data */
+            MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+            MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+            MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3));
+            MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \
+                              IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3));
+          }  /* end of for (ky = 0; ky < kSizeY; ky++)*/
+        }      /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable store count */
+        varLen = XT_MIN(outW - x, vectorizationWidth);
+
+        /* store the first half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         */
+
+        /* Storing the first row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the first row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* extract the half of the output vectors
+         * dvecOut1, dvecOut2, dvecOut3, dvecOut4
+         * and store in the next row
+         */
+
+        /* Storing the 2nd row outputs, first channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 2nd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable2ndCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 3rd channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable3rdCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row outputs, 4th channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \
+                                              outDataPitch1 * enable2ndRow) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \
+                       (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       varLen * enable4thCh * enable2ndRow);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 4 * outDataPitch2 * bytesPerPixel;
+        pCoeff  += 4 * coeffPitch3;
+      }  /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/
+    }    /* end of for (y = 0; y < outH; y += 2)*/
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_MxNj1d2I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for MxN 3D convolution   */
+/*               with dilation = 2. Based on MORPH pre-processor specifiers,  */
+/*               code implementation is generated during preprocessing stage. */
+/*               This method can be used to generate MxN 3D dilated           */
+/*               convolution function and MxN 3D VQ dilated convolution       */
+/*               function for U8 bit and S8 bit input data with input stride  */
+/*               equal to 1.                                                  */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is MxNxDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_MxNj1d2_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_MxNj1d2_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_MxNj1d2_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_MxNj1d2_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj1d2), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(coeffTile) <= 16, XAI_ERR_KSIZE,           \
+                    "Kernel width = %d, which should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM1(coeffTile));
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(coeffTile) <= 16, XAI_ERR_KSIZE,              \
+                    "\nKernel height = %d, which should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM2(coeffTile));
+    XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_DILATION(param, 2);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t dilationU     = XAI_CNN_CONV_GET_DILATION(param);
+
+  /* Pitches of Coefficient Data (WHDN) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Since the dilation value > 1 ,                                      */
+  /* Effective Kernel size = dilation(KernelSize - 1) + 1                */
+  /* Effective kernel size is used for calculating the min required edge */
+  int32_t dilatedKSizeX = dilationU * (kSizeX - 1) + 1;
+  int32_t dilatedKSizeY = dilationU * (kSizeY - 1) + 1;
+
+  /* For dilation equal to 2 dilated width and height will always be odd */
+  /* Condition check to evaluate left or right alignment of kernel based */
+  /* on the edge flag is not required.                                   */
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-((dilatedKSizeY / 2) * inDataPitch1 + (dilatedKSizeX / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn1;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2;
+
+  /* Generating two select interleave pattern to apply on accumulator values just before storing
+   * For 8 bit output
+   *     Pattern1 = 0  64 1  65 2  66    ....  31 95
+   *     Pattern2 = 32 96 33 97 34 98  ...     63 127
+   * For 16 bit output
+   *     Pattern1 = 0  1  64 65  2 3  66 67 .... 30 31 94  95
+   *     Pattern2 = 32 33 96 97 34 35 98 99  ... 62 63 126 127
+   */
+  /* 0 1 2 3 .. 62 63*/
+  xb_vec2Nx8 dvecPattern1 = IVP_SEQ2NX8();
+  /* 64 65 66 67 ...126 127*/
+  xb_vec2Nx8 dvecPattern2 = IVP_ADD2NX8(dvecPattern1, 64);
+
+  if (!typeFlag)
+  {
+    MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \
+                   dvecPattern2, dvecPattern1, \
+                   IVP_DSELI_8B_INTERLEAVE_1);
+  }
+  else
+  {
+    MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \
+                   dvecPattern2, dvecPattern1, \
+                   IVP_DSELI_INTERLEAVE_1);
+  }
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  const int32_t vectorizationWidth = 4 * XCHAL_IVPN_SIMD_WIDTH - dilatedKSizeX + 1;
+  int32_t varLen;
+
+  /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time
+   * into two vectors. Also loop across output channels is unrolled twice,
+   * thereby producing four output vectors in 1 iteration.
+   *
+   * Load 128 input bytes from row corresponding to each ky
+   * dvecInData11 = a0 a1 a2 a3.... a63
+   * dvecInData12 = a64 a65 a66 .... a127
+   *
+   * Separate odd and even indices
+   * dvecInData11 = a0 a2 a4 a6.... a126
+   * dvecInData12 = a1 a3 a5 a7.... a127
+   *
+   * Let the coefficients be
+   * C11 C12 C13 ... C1kW
+   * C21 C22 C23 ... C2kW
+   *   .
+   *   .
+   * CkH1 CkH2 CkH3 ... CkHkW
+   *
+   * acc11 = [a0 a2 a4 a6.... a126] * C11 +
+   *         [a2 a4 a6.... a126 X ] * C12 +
+   *         [a4 a6.... a126 X  X ] * C13 +
+   *                .
+   *                .
+   *         [                    ] * C1kW
+   *
+   * acc12 = [a1 a3 a5 a7.... a127] * C11 +
+   *         [a3 a5 a7.... a127 X ] * C12 +
+   *         [a5 a7.... a127 X  X ] * C13 +
+   *                .
+   *                .
+   *         [                    ] * C1kW
+   *
+   * Continue the same multiplication steps for ky = 1 to kHeight -1 .
+   * acc11 and acc12 contains convolved output corresponding to even and odd indices
+   * respectively at the end of inchannel loop iterations.
+   *
+   * acc11 and acc12 are interleaved to obtain the outputs in correct order.
+   *
+   */
+
+  if (kSizeX > 12)
+  {
+    /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time
+     * into two vectors. Also loop across output channels is unrolled twice,
+     * thereby producing four output vectors in 1 iteration
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(64, inW - x);
+
+      for (y = 0; y < outH; y++)     /* Loop across Output height */
+      {
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vec2Nx8 dvecCoeffData11;
+            xb_vec2Nx8 dvecCoeffData21;
+
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 128 bytes of input row */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \
+                                 dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                             IVP_DSELI_8B_DEINTERLEAVE_1);
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* multiplies loaded input data with first four coeffs */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data with next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data with next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                2));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                2));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                2));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                2));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data with next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                3));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                3));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                3));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                3));
+            } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvec1L, dvec2L, dvec3L, dvec4L;
+          xb_vec2Nx8 dvec1H, dvec2H, dvec3H, dvec4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Interleave odd and even indices */
+          xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1);
+          xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2);
+          xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1);
+          xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2);
+          xb_vec2Nx8 dvecOut3L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern1);
+          xb_vec2Nx8 dvecOut4L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern2);
+          xb_vec2Nx8 dvecOut3H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern1);
+          xb_vec2Nx8 dvecOut4H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern2);
+
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \
+                         2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the second output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \
+                                                          2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y ++)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kSizeX > 8)
+  {
+    /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time
+     * into two vectors. Also loop across output channels is unrolled twice,
+     * thereby producing four output vectors in 1 iteration
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(64, inW - x);
+
+      for (y = 0; y < outH; y++)     /* Loop across Output height */
+      {
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel*/
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vec2Nx8 dvecCoeffData11;
+            xb_vec2Nx8 dvecCoeffData21;
+
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 128 bytes of input row */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \
+                                 dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                             IVP_DSELI_8B_DEINTERLEAVE_1);
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* multiplies loaded input data with first four coeffs */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data with next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data with next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                2));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                2));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                2));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                2));
+            } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvec1L, dvec2L, dvec3L, dvec4L;
+          xb_vec2Nx8 dvec1H, dvec2H, dvec3H, dvec4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Interleave odd and even indices */
+          xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1);
+          xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2);
+          xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1);
+          xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2);
+          xb_vec2Nx8 dvecOut3L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern1);
+          xb_vec2Nx8 dvecOut4L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern2);
+          xb_vec2Nx8 dvecOut3H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern1);
+          xb_vec2Nx8 dvecOut4H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern2);
+
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \
+                         2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the second output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \
+                                                          2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y ++)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kSizeX > 4)
+  {
+    /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time
+     * into two vectors. Also loop across output channels is unrolled twice,
+     * thereby producing four output vectors in 1 iteration
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(64, inW - x);
+
+      for (y = 0; y < outH; y++)     /* Loop across Output height */
+      {
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vec2Nx8 dvecCoeffData11;
+            xb_vec2Nx8 dvecCoeffData21;
+
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 128 bytes of input row */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \
+                                 dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                             IVP_DSELI_8B_DEINTERLEAVE_1);
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* multiples loaded input data with first four coeff */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiples input data with next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+            } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvec1L, dvec2L, dvec3L, dvec4L;
+          xb_vec2Nx8 dvec1H, dvec2H, dvec3H, dvec4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Interleave odd and even indices */
+          xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1);
+          xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2);
+          xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1);
+          xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2);
+          xb_vec2Nx8 dvecOut3L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern1);
+          xb_vec2Nx8 dvecOut4L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern2);
+          xb_vec2Nx8 dvecOut3H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern1);
+          xb_vec2Nx8 dvecOut4H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern2);
+
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \
+                         2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the second output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \
+                                                          2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y ++)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else
+  {
+    /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time
+     * into two vectors. Also loop across output channels is unrolled twice,
+     * thereby producing four output vectors in 1 iteration
+     */
+
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(64, inW - x);
+
+      for (y = 0; y < outH; y++)     /* Loop across Output height */
+      {
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vec2Nx8 dvecCoeffData11;
+            xb_vec2Nx8 dvecCoeffData21;
+
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 128 bytes of input row */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \
+                                 dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                             IVP_DSELI_8B_DEINTERLEAVE_1);
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* multiples loaded input data with first four coeff */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+            } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvec1L, dvec2L, dvec3L, dvec4L;
+          xb_vec2Nx8 dvec1H, dvec2H, dvec3H, dvec4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Interleave odd and even indices */
+          xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1);
+          xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2);
+          xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1);
+          xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2);
+          xb_vec2Nx8 dvecOut3L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern1);
+          xb_vec2Nx8 dvecOut4L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern2);
+          xb_vec2Nx8 dvecOut3H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern1);
+          xb_vec2Nx8 dvecOut4H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern2);
+
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \
+                         2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the second output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \
+                                                          2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y ++)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+*   xaiConvolved(VQ)3D_S_MxNj1d4I8S8IX_MOW_WHD
+*  ***************************************************************************************/
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for MxN 3D convolution   */
+/*               with dilation = 4. Based on MORPH pre-processor specifiers,  */
+/*               code implementation is generated during preprocessing stage. */
+/*               This method can be used to generate MxN 3D dilated           */
+/*               convolution function and MxN 3D VQ dilated convolution       */
+/*               function for U8 bit and S8 bit input data with input stride  */
+/*               equal to 1.                                                  */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XI Error Code                                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is MxNxDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolvedVQ3D_S_MxNj1d4_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_MxNj1d4_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_MxNj1d4_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_MxNj1d4_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj1d4), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(coeffTile) <= 16, XAI_ERR_KSIZE,           \
+                    "Kernel width = %d, which should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM1(coeffTile));
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(coeffTile) <= 16, XAI_ERR_KSIZE,               \
+                    "\nKernel height = %d,  which should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM2(coeffTile));
+    XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)),                                           \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_DILATION(param, 4);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile),                                                                                          \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t dilationU     = XAI_CNN_CONV_GET_DILATION(param);
+
+  /* Pitches of Coefficient Data (WHDN) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Since the dilation value > 1 ,                                      */
+  /* Effective Kernel size = dilation(KernelSize - 1) + 1                */
+  /* Effective kernel size is used for calculating the min required edge */
+  int32_t dilatedKSizeX = dilationU * (kSizeX - 1) + 1;
+  int32_t dilatedKSizeY = dilationU * (kSizeY - 1) + 1;
+
+  /* For dilation equal to 4 dilated width and height will always be odd */
+  /* Condition check to evaluate left or right alignment of kernel based */
+  /* on the edge flag is not required.                                   */
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-((dilatedKSizeY / 2) * inDataPitch1 + (dilatedKSizeX / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8 * restrict pdvecIn1;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2;
+
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  const int32_t vectorizationWidth = 4 * XCHAL_IVPN_SIMD_WIDTH - dilatedKSizeX + 1;
+  int32_t varLen;
+
+  /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time
+   * into two vectors. Also loop across output channels is unrolled twice,
+   * thereby producing four output vectors in 1 iteration.
+   *
+   * Load 128 bytes from row corresponding to each ky
+   * dvecInData11 = a0 a1 a2 a3      ...                   a63
+   * dvecInData12 = a64 a65 a66      ...                   a127
+   *
+   * Deinterleave the indices
+   * dvecInData11 = a0 a2 a4 a6      ...                   a126
+   * dvecInData12 = a1 a3 a5 a7      ...                   a127
+   *
+   * Deinterleave the indices
+   * dvecInData11 = a0 a4 a8   ...   a124 ... a1 a5  ...   a125
+   * dvecInData12 = a2 a6 a10   ...  a126 ... a3 a7  ...   a127
+   *
+   * Let the coefficients be
+   * C11 C12 C13 ... C1kW
+   * C21 C22 C23 ... C2kW
+   *   .
+   *   .
+   * CkH1 CkH2 CkH3 ... CkHkW
+   *
+   * dacc11 = [a0 a4 a8   ...   a124 ... a1 a5  ...   a125] * C11 +
+   *         [a4 a8   ...   a124 ... a1 a5  ...   a125 X ] * C12 +
+   *         [a8   ...   a124 ... a1 a5  ...   a125 X  X ] * C13
+   *                      .
+   *                      .
+   *         [                                           ] * C1kW
+   *
+   * dacc12 = [a2 a6 a10   ...  a126 ... a3 a7  ...   a127] * C11 +
+   *         [a6 a10   ...  a126 ... a3 a7  ...   a127 X ] * C12 +
+   *         [a10   ...  a126 ... a3 a7  ...   a127 X  X ] * C13 +
+   *                      .
+   *                      .
+   *         [                                           ] * C1kW
+   *
+   *
+   * Continue the same multiplication steps for ky = 1 to kHeight -1 .
+   * dacc11 and dacc12 contains convolved output corresponding to even and odd indices
+   * respectively at the end of inchannel loop iterations.
+   *
+   * dacc11 and dacc12 are interleaved to obtain the outputs in the correct order.
+   * Pack, Shift scale and clamp dacc11 and dacc12 to obtain dvecOut1L , dvecOut1H, dvecOut2L and dvecOut2H
+   *
+   * For 8bit output, dvecOutL contains the required output elements
+   * dvecOut1L = [A0 A4 A8 ... A116 X X A1 A5 ... A117 X X] - 64 elements
+   * dvecOut2L = [A2 A6 A10 ...A118 X X A3 A7 ... A119 X X] - 64 elements
+   * Interleave the elements
+   * dvecOut1L = [A0 A2 A4       ...    A116 A118 X X X X ] - 64 elements
+   * dvecOut2L = [A1 A3 A5       ...    A117 A119 X X X X ] - 64 elements
+   * Interleave the elements
+   * dvecOut1L = [A0 A1 A2 A3                   ...                 ]- 64 elements
+   * dvecOut2L = [  ...         A116 A117 A118 A119 X X X X X X X X ]- 64 elements
+   *
+   *
+   * For 16bit output
+   * dvecOut1L = [A0 A4 A8  ... A116 X X] - 32 16b elements
+   * dvecOut1H = [A1 A5 A9  ... A117 X X] - 32 16b elements
+   * dvecOut2L = [A2 A6 A10 ... A118 X X] - 32 16b elements
+   * dvecOut2H = [A3 A7 A11 ... A119 X X] - 32 16b elements
+   * Interleave the elements of dvecOut1L and dvecOut1H
+   * dvecOut1L = [A0 A1 A4 A5         ...      ]
+   * dvecOut1H = [ ...            A116 A117 X X]
+   * Interleave the elements of dvecOut2L and dvecOut2H
+   * dvecOut2L = [A2 A3 A6 A7               ...]
+   * dvecOut2H = [ ...            A118 A119 X X]
+   * Interleave2 the elements of dvecOut2L and dvecOut1L
+   * dvecOut1L = [A0  A1  A2  A3                        ...          ]
+   * dvecOut2L = [A32 A33 A34 A35                        ...         ]
+   * Interleave2 the elements of dvecOut2H and dvecOut1H
+   * dvecOut1H = [A64 A65 A66 A67                  ...               ]
+   * dvecOut2H = [ ...            A116 A117 A118 A119 X X X X X X X X]
+   *
+   */
+
+  if (kSizeX > 12)
+  {
+    /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time
+     * into two vectors. Also loop across output channels is unrolled twice,
+     * thereby producing four output vectors in 1 iteration
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(64, inW - x);
+
+      for (y = 0; y < outH; y++)     /* Loop across Output height */
+      {
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vec2Nx8 dvecCoeffData11;
+            xb_vec2Nx8 dvecCoeffData21;
+
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 128 bytes of input row */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \
+                                 dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                             IVP_DSELI_8B_DEINTERLEAVE_1);
+              MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                             IVP_DSELI_8B_DEINTERLEAVE_1);
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* multiplies loaded input data with the first four coeffs */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data with the next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data with the next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                2));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                2));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                2));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                2));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data with the next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                3));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                3));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                3));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                3));
+            } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* 8 bit output */
+          if (!typeFlag)
+          {
+            MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+          }
+          else /* 16bit output */
+          {
+            MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                           IVP_DSELI_INTERLEAVE_2);
+            MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \
+                           IVP_DSELI_INTERLEAVE_2);
+            MORPH_OP_DSELI(dvecOut3H, dvecOut3L, dvecOut3H, dvecOut3L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4H, dvecOut4L, dvecOut4H, dvecOut4L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                           IVP_DSELI_INTERLEAVE_2);
+            MORPH_OP_DSELI(dvecOut4H, dvecOut3H, dvecOut4H, dvecOut3H, \
+                           IVP_DSELI_INTERLEAVE_2);
+          }
+
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \
+                         2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the second output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \
+                                                          2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y ++)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kSizeX > 8)
+  {
+    /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time
+     * into two vectors. Also loop across output channels is unrolled twice,
+     * thereby producing four output vectors in 1 iteration
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(64, inW - x);
+
+      for (y = 0; y < outH; y++)     /* Loop across Output height */
+      {
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel*/
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vec2Nx8 dvecCoeffData11;
+            xb_vec2Nx8 dvecCoeffData21;
+
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 128 bytes of input row */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \
+                                 dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                             IVP_DSELI_8B_DEINTERLEAVE_1);
+              MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                             IVP_DSELI_8B_DEINTERLEAVE_1);
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* multiplies loaded input data with the first four coeffs */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data with the next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data with the next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                2));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                2));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                2));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                2));
+            } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* 8 bit output */
+          if (!typeFlag)
+          {
+            MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+          }
+          else /* 16bit output */
+          {
+            MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                           IVP_DSELI_INTERLEAVE_2);
+            MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \
+                           IVP_DSELI_INTERLEAVE_2);
+            MORPH_OP_DSELI(dvecOut3H, dvecOut3L, dvecOut3H, dvecOut3L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4H, dvecOut4L, dvecOut4H, dvecOut4L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                           IVP_DSELI_INTERLEAVE_2);
+            MORPH_OP_DSELI(dvecOut4H, dvecOut3H, dvecOut4H, dvecOut3H, \
+                           IVP_DSELI_INTERLEAVE_2);
+          }
+
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \
+                         2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the second output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \
+                                                          2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y ++)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kSizeX > 4)
+  {
+    /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time
+     * into two vectors. Also loop across output channels is unrolled twice,
+     * thereby producing four output vectors in 1 iteration
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(64, inW - x);
+
+      for (y = 0; y < outH; y++)     /* Loop across Output height */
+      {
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vec2Nx8 dvecCoeffData11;
+            xb_vec2Nx8 dvecCoeffData21;
+
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 128 bytes of input row */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \
+                                 dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                             IVP_DSELI_8B_DEINTERLEAVE_1);
+              MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                             IVP_DSELI_8B_DEINTERLEAVE_1);
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* multiplies loaded input data with first four coeff */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+
+              /* right rotate the input vectors by 4
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4);
+              dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4);
+
+              /* multiplies input data with next four coeffs from the same row */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                1));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                1));
+            } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* 8 bit output */
+          if (!typeFlag)
+          {
+            MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+          }
+          else /* 16bit output */
+          {
+            MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                           IVP_DSELI_INTERLEAVE_2);
+            MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \
+                           IVP_DSELI_INTERLEAVE_2);
+            MORPH_OP_DSELI(dvecOut3H, dvecOut3L, dvecOut3H, dvecOut3L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4H, dvecOut4L, dvecOut4H, dvecOut4L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                           IVP_DSELI_INTERLEAVE_2);
+            MORPH_OP_DSELI(dvecOut4H, dvecOut3H, dvecOut4H, dvecOut3H, \
+                           IVP_DSELI_INTERLEAVE_2);
+          }
+
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \
+                         2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the second output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \
+                                                          2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y ++)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else
+  {
+    /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time
+     * into two vectors. Also loop across output channels is unrolled twice,
+     * thereby producing four output vectors in 1 iteration
+     */
+
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(64, inW - x);
+
+      for (y = 0; y < outH; y++)     /* Loop across Output height */
+      {
+        /* initialize output data pointer */
+        int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+        /* initialize input data pointer */
+        MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int8_t *pCoeff = &pCoeffData[0];
+        int32_t *pBias = &pBiasData[0];
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+          xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22;
+
+          dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1);
+          IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1);
+
+          dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2);
+          IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+          pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vec2Nx8 dvecCoeffData11;
+            xb_vec2Nx8 dvecCoeffData21;
+
+            xb_vec2Nx8 dvecInData11, dvecInData12;
+
+            pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2);
+
+            for (ky = 0; ky < kSizeY; ky++)   /* Loop across kernel height */
+            {
+              /* loads 128 bytes of input row */
+              valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+              MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \
+                                 dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+
+              MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                             IVP_DSELI_8B_DEINTERLEAVE_1);
+              MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \
+                             IVP_DSELI_8B_DEINTERLEAVE_1);
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1);
+
+              /* multiplies loaded input data with first four coeff */
+              MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \
+                                0));
+
+              MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+              MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32                \
+                                (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \
+                                0));
+            } /* end of for (ky = 0; ky < kSizeY; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#if DILATED_VQ_CONV == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* 8 bit output */
+          if (!typeFlag)
+          {
+            MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                           IVP_DSELI_8B_INTERLEAVE_1);
+          }
+          else /* 16bit output */
+          {
+            MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \
+                           IVP_DSELI_INTERLEAVE_2);
+            MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \
+                           IVP_DSELI_INTERLEAVE_2);
+            MORPH_OP_DSELI(dvecOut3H, dvecOut3L, dvecOut3H, dvecOut3L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4H, dvecOut4L, dvecOut4H, dvecOut4L, \
+                           IVP_DSELI_INTERLEAVE_1);
+            MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \
+                           IVP_DSELI_INTERLEAVE_2);
+            MORPH_OP_DSELI(dvecOut4H, dvecOut3H, dvecOut4H, dvecOut3H, \
+                           IVP_DSELI_INTERLEAVE_2);
+          }
+
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \
+                         2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Storing the second output depth, first row */
+          pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \
+                                                          2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \
+                         (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          pOutput += 2 * outDataPitch2 * bytesPerPixel;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y ++)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
+//#endif
+#endif /*if ((XCHAL_VISION_TYPE >= 6))*/
+
+
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.c
new file mode 100644
index 00000000000..7f722359d57
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+/* Select the non-VQ variant; the header included below tests DILATED_VQ_CONV_S16 to pick its name/argument macros. */
+#define DILATED_VQ_CONV_S16  VQ_FALSE
+/* The function bodies live in the header and are generated by the preprocessor for the variant chosen above. */
+#include "cnn_dilated_conv_MOW_S16.h"
+#endif //if ((XCHAL_VISION_TYPE >= 6))
+
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.h
new file mode 100644
index 00000000000..9a4ee1a4dc1
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.h
@@ -0,0 +1,2948 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define VQ_TRUE   1
+#define VQ_FALSE  0
+/* Clear any previous definitions of the helper macros before they are redefined for this variant. */
+#undef MAKE_NAME_VQ
+#undef MAKE_ARGUMENTS
+#undef MAKE_PARAMS
+/* DILATED_VQ_CONV_S16 is expected to be defined by the including file; it selects one of the two macro sets below. */
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+/* VQ variant: "VQ" is pasted into the function name and an extra outputScaleArray argument is inserted before the output tile. */
+#define MAKE_NAME_VQ(a, b)             a ## VQ ## b
+#define MAKE_ARGUMENTS(a, b, c, d, e)  (const xai_pTile3D a, const xai_pTile4D b, const xai_pArray c, const xai_pArray outputScaleArray, xai_pTile3D d, const xai_cnn_conv_params * e)
+#define MAKE_PARAMS(a, b, c, d, e)     (a, b, c, outputScaleArray, d, e)
+
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+/* Non-VQ variant: the two name fragments are pasted together unchanged and there is no outputScaleArray argument. */
+#define MAKE_NAME_VQ(a, b)             a ## b
+#define MAKE_ARGUMENTS(a, b, c, d, e)  (const xai_pTile3D a, const xai_pTile4D b, const xai_pArray c, xai_pTile3D d, const xai_cnn_conv_params * e)
+#define MAKE_PARAMS(a, b, c, d, e)     (a, b, c, d, e)
+#endif
+/* MAKE_NAME_IMPL pastes name, "_", a type specifier and suffix into one identifier; MAKE_NAME fixes the specifier to S16. */
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT, suffix)  name ## _ ## MORPH_FNAME_SPECIFIER_IDT ## suffix
+
+#define MAKE_NAME(name, suffix)                                  MAKE_NAME_IMPL(name, S16, suffix)
+
+/*********************************************************************************
+ **************  xaiConvolved(VQ)3D_S_MxNj1d1_S16S16I16_MOW_WHD  *******************
+ **********************************************************************************/
+/*********************************************************************************/
+/* Description : P6 optimized generic implementation for MxN 3D convolution.     */
+/*               Code implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN 3D dilated convolution  */
+/*               function and MxN 3D VQ dilated convolution function for S16 bit */
+/*               input data with input stride equal to 1                         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                   */
+/*               Output scale array, CNN convolution params structure            */
+/* Outputs     : XAI Error Code                                                  */
+/* InOuts      : Output Tile                                                     */
+/* Assumptions : CoeffData is S16                                                */
+/*               biasArray is signed 64b, value not exceeding signed 48b         */
+/*               Output scale array is U16                                       */
+/*               OutData is S16 / U16                                            */
+/*               Kernel Size is MxNxDxN                                          */
+/*               Input and Output are in WHD format                              */
+/*               Coeff is in WHDN format                                         */
+/*********************************************************************************/
+
+/****************** xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD *********************/
+/****************** xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj1d1), S16I16_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S16(inTile);
+    XAI_CHECK_TILE3D_I16(outTile);
+    XAI_CHECK_TILE4D_S16(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S64(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffTile) <= 16) &&                   \
+                    (XAI_TILE4D_GET_DIM2(coeffTile) <= 16),                     \
+                    XAI_ERR_KSIZE, "\nKernel Width = %d and Kernel Height = %d\n \
+                    Kernel Width or Height should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM1(coeffTile), XAI_TILE4D_GET_DIM2(coeffTile));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and Stride along height = %hhu\n \
+                     Stride along width should be equal to stride along height",          \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_STRIDE(param, 1);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and Dilation along height = %hhu\n \
+                     Dilation along width should be equal to dilation along height",
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 32, \
+                    XAI_ERR_NORM, "Accumulator shift value = %hhu\nThe accumulator shift value should be less than 32",
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                                          \
+                    XAI_ERR_NORM, "Output shift = %hhu\nThe output shift value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+
+#if DILATED_VQ_CONV_S16 == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM2(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Pitches of Coefficient Data (WHDN) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int16_t* pInData     = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int16_t* pOutData    = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int64_t* pBiasData64 = (int64_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int16_t* pCoeffData  = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  int32_t leftEdge, topEdge;
+  int32_t minLim, maxLim;
+
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0;
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX : USHRT_MAX;
+  }
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, inCh;
+
+  xb_vecNx16 * restrict pvecIn1;
+  xb_vecNx16 * restrict pvecIn2;
+  xb_vecNx16* restrict pvecOut;
+  xb_vecN_2x32v* restrict phvecCoeff1, *restrict phvecCoeff2;
+  xb_vec2Nx8 *restrict pdvecBias64;
+
+  xb_vec2Nx8 seq1 = IVP_ADD2NX8(IVP_SEQ2NX8(), 2);
+  xb_vec2Nx8 seq2 = IVP_ADD2NX8(IVP_SEQ2NX8(), 34);
+  seq2 = IVP_MIN2NX8(seq2, 64);
+  xb_vec2Nx8 dvecSel = IVP_SEL2NX8I(seq2, seq1, IVP_SELI_8B_INTERLEAVE_1_LO);
+  /* Variable Declarations */
+  const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH;
+  int32_t varLen;
+  if (kWidthU > 12)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Output height is not unrolled in this branch, so
+     * 2 output vectors are produced per iteration.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(32, inW - x);
+
+      for (y = 0; y < outH; y++)    /* Loop across Output height */
+      {
+        /* initialize output data pointer */
+        int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)];
+
+        /* initialize input data pointer */
+        int16_t *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int16_t *pCoeff = &pCoeffData[0];
+        pdvecBias64 = (xb_vec2Nx8 *) pBiasData64;
+        valign vaBias = IVP_LA2NX8_PP(pdvecBias64);
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vecNx48 accSum11, accSum21;
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1);
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1);
+
+          phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vecN_2x32v hvecCoeffData11;
+            xb_vecN_2x32v hvecCoeffData21;
+
+            /* vecInData11 refers to 1st input row, first 32(or lesser) elements
+             * and vecInData12 refers to next few left out elements of the same row
+             * required to compute one 32 way output vector(To compute one 32 way
+             * output vector, we require 32 + edge1 + edge2 number of input elements)
+             */
+            xb_vecNx16 vecInData11, vecInData12, vecInData11A;
+
+            pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2);
+
+            for (ky = 0; ky < kHeightU; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = IVP_LANX16_PP(pvecIn1);
+              IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 4));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 4));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 5));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 5));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 6));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 6));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 7));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 7));
+            } /* end of for (ky = 0; ky < kHeightU; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vecNx16 vecOut1L, vecOut3L;
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2);
+          IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+
+          pOutput += 2 * outDataPitch2;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y++)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kWidthU > 8)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(32, inW - x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* handles odd output row */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+        /* initialize output data pointer */
+        int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)];
+
+        /* initialize input data pointer */
+        int16_t *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int16_t *pCoeff = &pCoeffData[0];
+        pdvecBias64 = (xb_vec2Nx8 *) pBiasData64;
+        valign vaBias = IVP_LA2NX8_PP(pdvecBias64);
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel*/
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vecNx48 accSum11, accSum12, accSum21, accSum22;
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1);
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh);
+          accSum12 = accSum11; accSum22 = accSum21;
+
+          /* priming of coeff load is done outside the innermost loop*/
+          phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1);
+
+          phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vecN_2x32v hvecCoeffData11;
+            xb_vecN_2x32v hvecCoeffData21;
+
+            /* vecInData11 refers to 1st input row, first 32(or lesser) elements
+             * and vecInData12 refers to next few left out elements of the same row
+             * required to compute one 32 way output vector(To compute one 32 way
+             * output vector, we require 32 + edge1 + edge2 number of input elements)
+             */
+            xb_vecNx16 vecInData11, vecInData12, vecInData11A;
+            xb_vecNx16 vecInData21, vecInData22, vecInData21A;
+
+            pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2);
+            pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kHeightU; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = IVP_LANX16_PP(pvecIn1);
+              IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* loads 2nd input row */
+              vaInData = IVP_LANX16_PP(pvecIn2);
+              IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 4));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 4));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 4));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 4));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 5));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 5));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 5));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 5));
+            } /* end of for (ky = 0; ky < kHeightU; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L;
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2);
+          IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1);
+          IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                               enable2ndRow * outDataPitch1));
+          IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          pOutput += 2 * outDataPitch2;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kWidthU > 4)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(32, inW - x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* handles odd output row */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+        /* initialize output data pointer */
+        int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)];
+
+        /* initialize input data pointer */
+        int16_t *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int16_t *pCoeff = &pCoeffData[0];
+        pdvecBias64 = (xb_vec2Nx8 *) pBiasData64;
+        valign vaBias = IVP_LA2NX8_PP(pdvecBias64);
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vecNx48 accSum11, accSum12, accSum21, accSum22;
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1);
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh);
+          accSum12 = accSum11; accSum22 = accSum21;
+
+          /* priming of coeff load is done outside the innermost loop*/
+          phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1);
+
+          phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vecN_2x32v hvecCoeffData11;
+            xb_vecN_2x32v hvecCoeffData21;
+
+            /* vecInData11 refers to 1st input row, first 32(or lesser) elements
+             * and vecInData12 refers to next few left out elements of the same row
+             * required to compute one 32 way output vector(To compute one 32 way
+             * output vector, we require 32 + edge1 + edge2 number of input elements)
+             */
+            xb_vecNx16 vecInData11, vecInData12, vecInData11A;
+            xb_vecNx16 vecInData21, vecInData22, vecInData21A;
+
+            pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2);
+            pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kHeightU; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = IVP_LANX16_PP(pvecIn1);
+              IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* loads 2nd input row */
+              vaInData = IVP_LANX16_PP(pvecIn2);
+              IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              /* multiples loaded input data with first four coeff */
+
+              /* right rotate the input vectors by 2 elements
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+
+              /* right rotate the input vectors by 2 elements
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+
+              /* right rotate the input vectors by 2 elements
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+            } /* end of for (ky = 0; ky < kHeightU; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L;
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2);
+          IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1);
+          IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                               enable2ndRow * outDataPitch1));
+          IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          pOutput += 2 * outDataPitch2;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(32, inW - x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* handles odd output row */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+        /* initialize output data pointer */
+        int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)];
+
+        /* initialize input data pointer */
+        int16_t *pInput = &pInData[inDataPitch1 * (y) + (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int16_t *pCoeff = &pCoeffData[0];
+        pdvecBias64 = (xb_vec2Nx8 *) pBiasData64;
+        valign vaBias = IVP_LA2NX8_PP(pdvecBias64);
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vecNx48 accSum11, accSum12, accSum21, accSum22;
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1);
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh);
+          accSum12 = accSum11; accSum22 = accSum21;
+
+          /* priming of coeff load is done outside the innermost loop*/
+          phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1);
+
+          phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+            xb_vecN_2x32v hvecCoeffData11;
+            xb_vecN_2x32v hvecCoeffData21;
+
+            /* vecInData11 refers to 1st input row, first 32(or lesser) elements
+             * and vecInData12 refers to next few left out elements of the same row
+             * required to compute one 32 way output vector(To compute one 32 way
+             * output vector, we require 32 + edge1 + edge2 number of input elements)
+             */
+            xb_vecNx16 vecInData11, vecInData12, vecInData11A;
+            xb_vecNx16 vecInData21, vecInData22, vecInData21A;
+
+            pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2);
+            pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow);
+
+#ifdef IS_VISION_130
+            for (ky = 0; ky < kHeightU; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              IVP_L2UNX16_XP(vecInData11, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_L2UNX16_XP(vecInData12, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* loads 2nd input row */
+              IVP_L2UNX16_XP(vecInData21, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_L2UNX16_XP(vecInData22, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiplies loaded input data with the first two coeffs */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiplies loaded input data with the coeff pair at index 1 */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+            } /* for (ky = 0; ky < kHeightU; ky++)*/
+
+#else
+            for (ky = 0; ky < kHeightU; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = IVP_LANX16_PP(pvecIn1);
+              IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* loads 2nd input row */
+              vaInData = IVP_LANX16_PP(pvecIn2);
+              IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiplies loaded input data with the first two coeffs */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2);
+              vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2);
+              /* multiplies loaded input data with the coeff pair at index 1 */
+              IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+              IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+            } /* for (ky = 0; ky < kHeightU; ky++)*/
+#endif
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L;
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2);
+          IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1);
+          IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                               enable2ndRow * outDataPitch1));
+          IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          pOutput += 2 * outDataPitch2;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************
+ *   xaiConvolved(VQ)3D_S_MxNj2d1_S16S16I16_MOW_WHD
+ *  ****************************************************************************/
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for MxN 3D convolution   */
+/*               with stride = 2. Code implementation is generated during     */
+/*               preprocessing stage. This method can be used to generate     */
+/*               MxN 3D dilated convolution function and MxN 3D VQ dilated    */
+/*               convolution function for S16 bit input data with stride equal*/
+/*               to 2 and dilation equal to 1                                 */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XAI Error Code                                               */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S16                                             */
+/*               biasArray is signed 64, value not exceeding signed 48b       */
+/*               Output scale array is U16                                    */
+/*               OutData is S16 / U16                                         */
+/*               Kernel Size is MxNxDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD ****************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj2d1), S16I16_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S16(inTile);
+    XAI_CHECK_TILE3D_I16(outTile);
+    XAI_CHECK_TILE4D_S16(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S64(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffTile) <= 16) &&                   \
+                    (XAI_TILE4D_GET_DIM2(coeffTile) <= 16),                     \
+                    XAI_ERR_KSIZE, "Kernel Width = %u and Kernel Height = %u\n \
+                    Kernel Width or Height should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM1(coeffTile), XAI_TILE4D_GET_DIM2(coeffTile));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \
+                    XAI_ERR_BADARG, "Stride along width = %u and Stride along height = %u\n \
+                     Stride along width should be equal to stride along height.",         \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_STRIDE(param, 2);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \
+                    XAI_ERR_BADARG, "Dilation along width = %u and Dilation along height = %u\n \
+                     Dilation along width should be equal to dilation along height.",       \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 32,                                                         \
+                    XAI_ERR_NORM, "Accumulator shift value = %u\nThe accumulator shift value should be less than 32", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                                        \
+                    XAI_ERR_NORM, "Output shift = %u\nThe output shift value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \
+                    XAI_ERR_DATASIZE, "Width of Output Scale Array = %u and Number of Kernels = %u\n \
+      Width of Output Scale Array should be greater than or equal to number of kernels.",    \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV_S16 == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM2(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Pitches of Coefficient Data (WHDN) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int16_t* pInData     = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int16_t* pOutData    = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int64_t* pBiasData64 = (int64_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int16_t* pCoeffData  = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  int32_t leftEdge, topEdge;
+  int32_t minLim, maxLim;
+
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0;
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX : USHRT_MAX;
+  }
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  int32_t varLen;
+
+  xb_vecNx16 * restrict pvecIn1;
+  xb_vecNx16 * restrict pvecIn2;
+  xb_vecNx16* restrict pvecOut;
+  xb_vecN_2x32v* restrict phvecCoeff1, *restrict phvecCoeff2;
+  xb_vec2Nx8 *restrict pdvecBias64;
+
+  xb_vec2Nx8 seq1 = IVP_ADD2NX8(IVP_SEQ2NX8(), 1);
+  xb_vec2Nx8 seq2 = IVP_ADD2NX8(IVP_SEQ2NX8(), 33);
+  seq2 = IVP_MIN2NX8(seq2, 64);
+  xb_vec2Nx8 dvecSel = IVP_SEL2NX8I(seq2, seq1, IVP_SELI_8B_INTERLEAVE_1_LO);
+
+  /* Number of output elements that can be generated
+   * with 2 input vector loads(32 way).*/
+  const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kWidthU) / stride) + 1;
+
+  if (kWidthU > 12)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* In order to handle odd output height  */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+
+        /* initialize output data pointer */
+        int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)];
+
+        /* initialize input data pointer */
+        int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int16_t *pCoeff = &pCoeffData[0];
+        pdvecBias64 = (xb_vec2Nx8 *) pBiasData64;
+        valign vaBias = IVP_LA2NX8_PP(pdvecBias64);
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vecNx48 accSum11, accSum12, accSum21, accSum22;
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1);
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh);
+          accSum12 = accSum11; accSum22 = accSum21;
+
+          /* priming of coeff load is done outside the innermost loop*/
+          phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1);
+
+          phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vecN_2x32v hvecCoeffData11;
+            xb_vecN_2x32v hvecCoeffData21;
+
+            /* vecInData11 refers to 1st input row, first 32(or lesser) elements
+             * and vecInData12 refers to next few left out elements of the same row
+             * required to compute one 32 way output vector(To compute one 32 way
+             * output vector, we require 32 + edge1 + edge2 number of input elements)
+             */
+            xb_vecNx16 vecInData11, vecInData12;
+            xb_vecNx16 vecInData21, vecInData22;
+
+            pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2);
+            pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \
+                                      stride * inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kHeightU; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = IVP_LANX16_PP(pvecIn1);
+              IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* loads Next(3rd) input row, corresponding to 2nd output row */
+
+              vaInData = IVP_LANX16_PP(pvecIn2);
+              IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* Re-arrange the data in the desired format                                    */
+              /* Assume input as 1,2,3,4,5,6,7...64                                          */
+              /* After re-arrangement using DSEL operation, updated vectors would be */
+              /* vecInData1 : 1,  3,  5,...61                                              */
+              /* vecInData2 : 2,  4,  6,...62                                              */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, IVP_SEQ2NX8());
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, IVP_SEQ2NX8());
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+              /* multiplies loaded input data with the first two coeffs */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 1 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+
+              /* right rotate the input vectors by 2 elements
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 2 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+
+              /* right rotate the input vectors by 2 elements
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 3 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 4 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 4));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 4));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 4));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 4));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 5 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 5));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 5));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 5));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 5));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 6 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 6));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 6));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 6));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 6));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 7 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 7));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 7));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 7));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 7));
+            } /* end of for (ky = 0; ky < kHeightU; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L;
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2);
+          IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1);
+          IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                               enable2ndRow * outDataPitch1));
+          IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          pOutput += 2 * outDataPitch2;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kWidthU > 8)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)     /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y += 2)      /* Loop across Output height */
+      {
+        /* In order to handle odd output height */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+        /* initialize output data pointer */
+        int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)];
+
+        /* initialize input data pointer */
+        int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int16_t *pCoeff = &pCoeffData[0];
+        pdvecBias64 = (xb_vec2Nx8 *) pBiasData64;
+        valign vaBias = IVP_LA2NX8_PP(pdvecBias64);
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)     /* Loop across Output depth */
+        {
+          /* handles odd output channel*/
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vecNx48 accSum11, accSum12, accSum21, accSum22;
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1);
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh);
+          accSum12 = accSum11; accSum22 = accSum21;
+
+          /* priming of coeff load is done outside the innermost loop*/
+          phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1);
+
+          phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)     /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vecN_2x32v hvecCoeffData11;
+            xb_vecN_2x32v hvecCoeffData21;
+
+            /* vecInData11 refers to 1st input row, first 32(or lesser) elements
+             * and vecInData12 refers to next few left out elements of the same row
+             * required to compute one 32 way output vector(To compute one 32 way
+             * output vector, we require 32 + edge1 + edge2 number of input elements)
+             */
+            xb_vecNx16 vecInData11, vecInData12;
+            xb_vecNx16 vecInData21, vecInData22;
+
+            pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2);
+            pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \
+                                      stride * inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kHeightU; ky++)     /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = IVP_LANX16_PP(pvecIn1);
+              IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* loads Next(3rd) input row, corresponding to 2nd output row */
+              vaInData = IVP_LANX16_PP(pvecIn2);
+              IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* Re-arrange the data in the desired format                                    */
+              /* Assume input as 1,2,3,4,5,6,7...64                                          */
+              /* After re-arrangement using DSEL operation, updated vectors would be */
+              /* vecInData1 : 1,  3,  5,...61                                              */
+              /* vecInData2 : 2,  4,  6,...62                                              */
+
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, IVP_SEQ2NX8());
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, IVP_SEQ2NX8());
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+              /* multiplies loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 1 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+
+              /* right rotate the input vectors by 2 elements
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 2 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+
+              /* right rotate the input vectors by 2 elements
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 3 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 4 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 4));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 4));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 4));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 4));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 5 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 5));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 5));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 5));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 5));
+            }   /* end of for (ky = 0; ky < kHeightU; ky++)*/
+          }     /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L;
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2);
+          IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1);
+          IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                               enable2ndRow * outDataPitch1));
+          IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          pOutput += 2 * outDataPitch2;
+          pCoeff  += 2 * coeffPitch3;
+        }   /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }     /* end of for (y = 0; y < outH; y += 2)*/
+    }       /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kWidthU > 4)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* In order to handle odd output height */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+        /* initialize output data pointer */
+        int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)];
+
+        /* initialize input data pointer */
+        int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int16_t *pCoeff = &pCoeffData[0];
+        pdvecBias64 = (xb_vec2Nx8 *) pBiasData64;
+        valign vaBias = IVP_LA2NX8_PP(pdvecBias64);
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vecNx48 accSum11, accSum12, accSum21, accSum22;
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1);
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh);
+          accSum12 = accSum11; accSum22 = accSum21;
+
+          /* priming of coeff load is done outside the innermost loop*/
+          phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1);
+
+          phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+
+            xb_vecN_2x32v hvecCoeffData11;
+            xb_vecN_2x32v hvecCoeffData21;
+
+            /* vecInData11 refers to 1st input row, first 32(or lesser) elements
+             * and vecInData12 refers to next few left out elements of the same row
+             * required to compute one 32 way output vector(To compute one 32 way
+             * output vector, we require 32 + edge1 + edge2 number of input elements)
+             */
+            xb_vecNx16 vecInData11, vecInData12;
+            xb_vecNx16 vecInData21, vecInData22;
+
+            pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2);
+            pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \
+                                      stride * inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kHeightU; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = IVP_LANX16_PP(pvecIn1);
+              IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* loads Next(3rd) input row, corresponding to 2nd output row */
+              vaInData = IVP_LANX16_PP(pvecIn2);
+              IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* Re-arrange the data in the desired format                                    */
+              /* Assume input as 1,2,3,4,5,6,7...64                                          */
+              /* After re-arrangement using DSEL operation, updated vectors would be */
+              /* vecInData1 : 1,  3,  5,...61                                              */
+              /* vecInData2 : 2,  4,  6,...62                                              */
+
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, IVP_SEQ2NX8());
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, IVP_SEQ2NX8());
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+              /* multiplies loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 1 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+
+              /* right rotate the input vectors by 2 elements
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 2 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+
+              /* right rotate the input vectors by 2 elements
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with the coeff pair at index 3 */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+            } /* end of for (ky = 0; ky < kHeightU; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L;
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2);
+          IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1);
+          IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                               enable2ndRow * outDataPitch1));
+          IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          pOutput += 2 * outDataPitch2;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* In order to handle odd output height */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+        /* initialize output data pointer */
+        int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)];
+
+        /* initialize input data pointer */
+        int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int16_t *pCoeff = &pCoeffData[0];
+        pdvecBias64 = (xb_vec2Nx8 *) pBiasData64;
+        valign vaBias = IVP_LA2NX8_PP(pdvecBias64);
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vecNx48 accSum11, accSum12, accSum21, accSum22;
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1);
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh);
+          accSum12 = accSum11; accSum22 = accSum21;
+
+          /* priming of coeff load is done outside the innermost loop*/
+          phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1);
+
+          phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+            xb_vecN_2x32v hvecCoeffData11;
+            xb_vecN_2x32v hvecCoeffData21;
+
+            /* vecInData11 refers to 1st input row, first 32(or lesser) elements
+             * and vecInData12 refers to next few left out elements of the same row
+             * required to compute one 32 way output vector(To compute one 32 way
+             * output vector, we require 32 + edge1 + edge2 number of input elements)
+             */
+            xb_vecNx16 vecInData11, vecInData12;
+            xb_vecNx16 vecInData21, vecInData22;
+
+            pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2);
+            pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \
+                                      stride * inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kHeightU; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = IVP_LANX16_PP(pvecIn1);
+              IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* loads Next(3rd) input row, corresponding to 2nd output row */
+
+              vaInData = IVP_LANX16_PP(pvecIn2);
+              IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* Re-arrange the data in the desired format                                    */
+              /* Assume input as 1,2,3,4,5,6,7...64                                          */
+              /* After re-arrangement using DSEL operation, updated vectors would be */
+              /* vecInData1 : 1,  3,  5,...61                                              */
+              /* vecInData2 : 2,  4,  6,...62                                              */
+
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, IVP_SEQ2NX8());
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, IVP_SEQ2NX8());
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+
+              IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel);
+              IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel);
+
+              /* multiplies loaded input data with next two coeff */
+              IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+              IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+            } /* for (ky = 0; ky < kHeightU; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L;
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          /* Storing the first row , first depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2);
+          IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1);
+          IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                               enable2ndRow * outDataPitch1));
+          IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          pOutput += 2 * outDataPitch2;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************
+ *   xaiConvolved(VQ)3D_S_MxNj4d1_S16S16I16_MOW_WHD
+ *  ****************************************************************************/
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for MxN 3D convolution   */
+/*               with stride = 4. Code implementation is generated during     */
+/*               preprocessing stage. This method can be used to generate     */
+/*               MxN 3D dilated convolution function and MxN 3D VQ dilated    */
+/*               convolution function for S16 bit input data with input stride*/
+/*               equal to 4.                                                  */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array, CNN convolution params structure         */
+/* Outputs     : XAI Error Code                                               */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S16                                             */
+/*               biasArray is signed 64, value not exceeding signed 48b       */
+/*               Output scale array is U16                                    */
+/*               OutData is S16 / U16                                         */
+/*               Kernel Size is MxNxDxN                                       */
+/*               Input and Output are in WHD format                           */
+/*               Coeff is in WHDN format                                      */
+/******************************************************************************/
+
+/****************** xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD *********************/
+/****************** xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD *******************/
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj4d1), S16I16_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S16(inTile);
+    XAI_CHECK_TILE3D_I16(outTile);
+    XAI_CHECK_TILE4D_S16(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S64(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffTile) <= 16) &&                   \
+                    (XAI_TILE4D_GET_DIM2(coeffTile) <= 16),                     \
+                    XAI_ERR_KSIZE, "Kernel Width = %u and Kernel Height = %u\n \
+                    Kernel Width or Height should be less than or equal to 16", \
+                    XAI_TILE4D_GET_DIM1(coeffTile), XAI_TILE4D_GET_DIM2(coeffTile));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \
+                    XAI_ERR_BADARG, "Stride along width = %u and Stride along height = %u\n \
+                     Stride along width should be equal to stride along height.",         \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_STRIDE(param, 4);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \
+                    XAI_ERR_BADARG, "Dilation along width = %u and Dilation along height = %u\n \
+                     Dilation along width should be equal to dilation along height.",       \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 32,                                                         \
+                    XAI_ERR_NORM, "Accumulator shift value = %u\nThe accumulator shift value should be less than 32", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                                        \
+                    XAI_ERR_NORM, "Output shift = %u\nThe output shift value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \
+                    XAI_ERR_DATASIZE, "Width of Output Scale Array = %u and Number of Kernels = %u\n \
+      Width of Output Scale Array should be greater than or equal to number of kernels.",    \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV_S16 == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* Getting parameters from the tile structures */
+  const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \
+                      XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile);
+  const int32_t outW     = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN)*/
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM2(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride        = XAI_CNN_CONV_GET_STRIDE(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Pitches of Coefficient Data (WHDN) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int16_t* pInData     = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int16_t* pOutData    = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int64_t* pBiasData64 = (int64_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int16_t* pCoeffData  = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  int32_t leftEdge, topEdge;
+  int32_t minLim, maxLim;
+
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0;
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX : USHRT_MAX;
+  }
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, ky;
+  int32_t varLen;
+
+  xb_vecNx16 * restrict pvecIn1;
+  xb_vecNx16 * restrict pvecIn2;
+  xb_vecNx16* restrict pvecOut;
+  xb_vecN_2x32v* restrict phvecCoeff1, *restrict phvecCoeff2;
+  xb_vec2Nx8 *restrict pdvecBias64;
+
+  /* Number of output elements that can be generated
+   * with 2 input vector loads(32 way).*/
+  const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kWidthU) / stride) + 1;
+
+  if (kWidthU > 12)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* In order to handle odd output height */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+        /* initialize output data pointer */
+        int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)];
+
+        /* initialize input data pointer */
+        int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int16_t *pCoeff = &pCoeffData[0];
+        pdvecBias64 = (xb_vec2Nx8 *) pBiasData64;
+        valign vaBias = IVP_LA2NX8_PP(pdvecBias64);
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vecNx48 accSum11, accSum21;
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1);
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1);
+
+          phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+            xb_vecN_2x32v hvecCoeffData11;
+            xb_vecN_2x32v hvecCoeffData21;
+
+            /* vecInData11 refers to 1st input row, first 32(or lesser) elements
+             * and vecInData12 refers to next few left out elements of the same row
+             * required to compute one 32 way output vector(To compute one 32 way
+             * output vector, we require 32 + edge1 + edge2 number of input elements)
+             */
+            xb_vecNx16 vecData1, vecData2, vecData3, vecData4;
+            xb_vecNx16 vecData5, vecData6, vecData7, vecData8;
+            xb_vecNx16 vecData9, vecData10, vecData11, vecData12;
+            xb_vecNx16 vecData13, vecData14, vecData15, vecData16;
+            xb_vecNx16 vecInData11, vecInData12;
+            xb_vecNx16 vecInData21, vecInData22;
+
+            pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2);
+            pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \
+                                      stride * inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kHeightU; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = IVP_LANX16_PP(pvecIn1);
+              IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* loads Next(5th) input row, corresponding to 2nd output row */
+              vaInData = IVP_LANX16_PP(pvecIn2);
+              IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+               * If 1st input row is 0,1,2,3,...63, and the 2nd input row is
+               * 64,65,66,67.........127, Data should be arranged  as
+               *
+               * vecData1 : 0, 4, 8,...56,60,  64,68,72,...120,124
+               * vecData2 : 1, 5, 9,...57,61,  65,69,73,...121,125
+               * vecData3 : 2, 6,10,...58,62,  66,70,74,...122,126
+               * vecData4 : 3, 7,11,...59,63,  67,71,75,...123,127
+               *
+               * Lower half of the vectors contain data from 1st output row and
+               * upper half of the vectors contain data from 2nd output row.
+               */
+
+              IVP_DSELNX16(vecData2, vecData1,
+                           IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0),
+                           IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0),
+                           IVP_SEQ2NX8());
+              IVP_DSELNX16(vecData4, vecData3,
+                           IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2),
+                           IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2),
+                           IVP_SEQ2NX8());
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+              /* multiples loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              /* multiples loaded input data with 2nd two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+
+              /* right rotate the input vectors by 2 elements
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              vecData5 = IVP_SELNX16I(0, vecData1, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData6 = IVP_SELNX16I(0, vecData2, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData7 = IVP_SELNX16I(0, vecData3, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData8 = IVP_SELNX16I(0, vecData4, IVP_SELI_16B_ROTATE_RIGHT_1);
+
+              /* multiples loaded input data with 3rd two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData6, vecData5, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum21, vecData6, vecData5, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+              /* multiples loaded input data with 4th two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData8, vecData7, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum21, vecData8, vecData7, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              vecData9  = IVP_SELNX16I(0, vecData5, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData10 = IVP_SELNX16I(0, vecData6, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData11 = IVP_SELNX16I(0, vecData7, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData12 = IVP_SELNX16I(0, vecData8, IVP_SELI_16B_ROTATE_RIGHT_1);
+
+              /* multiples loaded input data with 5th two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData10, vecData9, IVP_EXTRN_2X32(hvecCoeffData11, 4));
+              IVP_MULPAN16XR16(accSum21, vecData10, vecData9, IVP_EXTRN_2X32(hvecCoeffData21, 4));
+              /* multiples loaded input data with 6th two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData12, vecData11, IVP_EXTRN_2X32(hvecCoeffData11, 5));
+              IVP_MULPAN16XR16(accSum21, vecData12, vecData11, IVP_EXTRN_2X32(hvecCoeffData21, 5));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              vecData13 = IVP_SELNX16I(0, vecData9, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData14 = IVP_SELNX16I(0, vecData10, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData15 = IVP_SELNX16I(0, vecData11, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData16 = IVP_SELNX16I(0, vecData12, IVP_SELI_16B_ROTATE_RIGHT_1);
+
+              /* multiplies loaded input data with 7th two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData14, vecData13, IVP_EXTRN_2X32(hvecCoeffData11, 6));
+              IVP_MULPAN16XR16(accSum21, vecData14, vecData13, IVP_EXTRN_2X32(hvecCoeffData21, 6));
+              /* multiplies loaded input data with 8th two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData16, vecData15, IVP_EXTRN_2X32(hvecCoeffData11, 7));
+              IVP_MULPAN16XR16(accSum21, vecData16, vecData15, IVP_EXTRN_2X32(hvecCoeffData21, 7));
+            } /* end of for (ky = 0; ky < kHeightU; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vecNx16 vecOut1Ch, vecOut1L, vecOut1H;
+          xb_vecNx16 vecOut2Ch, vecOut2L, vecOut2H;
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          vecOut1L = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_LO_HALVES);
+          vecOut2L = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_LO_HALVES);
+          vecOut1H = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_HI_HALVES);
+          vecOut2H = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_HI_HALVES);
+          /* Storing the first row , first depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2);
+          IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1);
+          IVP_SAVNX16_XP(vecOut1H, vaOutData, pvecOut, 2 * enable2ndRow * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                               enable2ndRow * outDataPitch1));
+          IVP_SAVNX16_XP(vecOut2H, vaOutData, pvecOut, 2 * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          pOutput += 2 * outDataPitch2;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kWidthU > 8)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* In order to handle odd output height */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+        /* initialize output data pointer */
+        int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)];
+
+        /* initialize input data pointer */
+        int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int16_t *pCoeff = &pCoeffData[0];
+        pdvecBias64 = (xb_vec2Nx8 *) pBiasData64;
+        valign vaBias = IVP_LA2NX8_PP(pdvecBias64);
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vecNx48 accSum11, accSum21;
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1);
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1);
+
+          phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+            xb_vecN_2x32v hvecCoeffData11;
+            xb_vecN_2x32v hvecCoeffData21;
+
+            /* vecInData11 refers to 1st input row, first 32(or lesser) elements
+             * and vecInData12 refers to next few left out elements of the same row
+             * required to compute one 32 way output vector(To compute one 32 way
+             * output vector, we require 32 + edge1 + edge2 number of input elements)
+             */
+            xb_vecNx16 vecData1, vecData2, vecData3, vecData4;
+            xb_vecNx16 vecData5, vecData6, vecData7, vecData8;
+            xb_vecNx16 vecData9, vecData10, vecData11, vecData12;
+            xb_vecNx16 vecInData11, vecInData12;
+            xb_vecNx16 vecInData21, vecInData22;
+
+            pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2);
+            pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \
+                                      stride * inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kHeightU; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = IVP_LANX16_PP(pvecIn1);
+              IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* loads Next(5th) input row, corresponding to 2nd output row */
+              vaInData = IVP_LANX16_PP(pvecIn2);
+              IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+               * If 1st input row is 0,1,2,3,...63, and the 2nd input row is
+               * 64,65,66,67.........127, Data should be arranged  as
+               *
+               * vecData1 : 0, 4, 8,...56,60,  64,68,72,...120,124
+               * vecData2 : 1, 5, 9,...57,61,  65,69,73,...121,125
+               * vecData3 : 2, 6,10,...58,62,  66,70,74,...122,126
+               * vecData4 : 3, 7,11,...59,63,  67,71,75,...123,127
+               *
+               * Lower half of the vectors contain data from 1st output row and
+               * upper half of the vectors contain data from 2nd output row.
+               */
+
+              IVP_DSELNX16(vecData2, vecData1,
+                           IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0),
+                           IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0),
+                           IVP_SEQ2NX8());
+              IVP_DSELNX16(vecData4, vecData3,
+                           IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2),
+                           IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2),
+                           IVP_SEQ2NX8());
+
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+              /* multiplies loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              /* multiplies loaded input data with 2nd two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+
+              /* right rotate the input vectors by 2 elements
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              vecData5 = IVP_SELNX16I(0, vecData1, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData6 = IVP_SELNX16I(0, vecData2, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData7 = IVP_SELNX16I(0, vecData3, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData8 = IVP_SELNX16I(0, vecData4, IVP_SELI_16B_ROTATE_RIGHT_1);
+
+              /* multiplies loaded input data with 3rd two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData6, vecData5, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum21, vecData6, vecData5, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+              /* multiplies loaded input data with 4th two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData8, vecData7, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum21, vecData8, vecData7, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+
+              /* right rotate the input vectors by 2
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              vecData9  = IVP_SELNX16I(0, vecData5, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData10 = IVP_SELNX16I(0, vecData6, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData11 = IVP_SELNX16I(0, vecData7, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData12 = IVP_SELNX16I(0, vecData8, IVP_SELI_16B_ROTATE_RIGHT_1);
+
+              /* multiplies loaded input data with 5th two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData10, vecData9, IVP_EXTRN_2X32(hvecCoeffData11, 4));
+              IVP_MULPAN16XR16(accSum21, vecData10, vecData9, IVP_EXTRN_2X32(hvecCoeffData21, 4));
+              /* multiplies loaded input data with 6th two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData12, vecData11, IVP_EXTRN_2X32(hvecCoeffData11, 5));
+              IVP_MULPAN16XR16(accSum21, vecData12, vecData11, IVP_EXTRN_2X32(hvecCoeffData21, 5));
+            } /* end of for (ky = 0; ky < kHeightU; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vecNx16 vecOut1Ch, vecOut1L, vecOut1H;
+          xb_vecNx16 vecOut2Ch, vecOut2L, vecOut2H;
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          vecOut1L = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_LO_HALVES);
+          vecOut2L = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_LO_HALVES);
+          vecOut1H = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_HI_HALVES);
+          vecOut2H = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_HI_HALVES);
+          /* Storing the first row , first depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2);
+          IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1);
+          IVP_SAVNX16_XP(vecOut1H, vaOutData, pvecOut, 2 * enable2ndRow * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                               enable2ndRow * outDataPitch1));
+          IVP_SAVNX16_XP(vecOut2H, vaOutData, pvecOut, 2 * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          pOutput += 2 * outDataPitch2;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else if (kWidthU > 4)
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* In order to handle odd output height */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+        /* initialize output data pointer */
+        int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)];
+
+        /* initialize input data pointer */
+        int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int16_t *pCoeff = &pCoeffData[0];
+        pdvecBias64 = (xb_vec2Nx8 *) pBiasData64;
+        valign vaBias = IVP_LA2NX8_PP(pdvecBias64);
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vecNx48 accSum11, accSum21;
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1);
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1);
+
+          phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+            xb_vecN_2x32v hvecCoeffData11;
+            xb_vecN_2x32v hvecCoeffData21;
+
+            /* vecInData11 refers to 1st input row, first 32(or lesser) elements
+             * and vecInData12 refers to next few left out elements of the same row
+             * required to compute one 32 way output vector(To compute one 32 way
+             * output vector, we require 32 + edge1 + edge2 number of input elements)
+             */
+            xb_vecNx16 vecData1, vecData2, vecData3, vecData4;
+            xb_vecNx16 vecData5, vecData6, vecData7, vecData8;
+            xb_vecNx16 vecInData11, vecInData12;
+            xb_vecNx16 vecInData21, vecInData22;
+
+            pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2);
+            pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \
+                                      stride * inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kHeightU; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = IVP_LANX16_PP(pvecIn1);
+              IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* loads Next(5th) input row, corresponding to 2nd output row */
+
+              vaInData = IVP_LANX16_PP(pvecIn2);
+              IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+               * If 1st input row is 0,1,2,3,...63, and the 2nd input row is
+               * 64,65,66,67.........127, Data should be arranged  as
+               *
+               * vecData1 : 0, 4, 8,...56,60,  64,68,72,...120,124
+               * vecData2 : 1, 5, 9,...57,61,  65,69,73,...121,125
+               * vecData3 : 2, 6,10,...58,62,  66,70,74,...122,126
+               * vecData4 : 3, 7,11,...59,63,  67,71,75,...123,127
+               *
+               * Lower half of the vectors contain data from 1st output row and
+               * upper half of the vectors contain data from 2nd output row.
+               */
+
+              IVP_DSELNX16(vecData2, vecData1,
+                           IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0),
+                           IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0),
+                           IVP_SEQ2NX8());
+              IVP_DSELNX16(vecData4, vecData3,
+                           IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2),
+                           IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2),
+                           IVP_SEQ2NX8());
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+              /* multiplies loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              /* multiplies loaded input data with 2nd two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+
+              /* right rotate the input vectors by 2 elements
+               * in order to multiply with next column of
+               * coeff in the next iteration
+               */
+              vecData5 = IVP_SELNX16I(0, vecData1, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData6 = IVP_SELNX16I(0, vecData2, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData7 = IVP_SELNX16I(0, vecData3, IVP_SELI_16B_ROTATE_RIGHT_1);
+              vecData8 = IVP_SELNX16I(0, vecData4, IVP_SELI_16B_ROTATE_RIGHT_1);
+
+              /* multiplies loaded input data with 3rd two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData6, vecData5, IVP_EXTRN_2X32(hvecCoeffData11, 2));
+              IVP_MULPAN16XR16(accSum21, vecData6, vecData5, IVP_EXTRN_2X32(hvecCoeffData21, 2));
+              /* multiplies loaded input data with 4th two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData8, vecData7, IVP_EXTRN_2X32(hvecCoeffData11, 3));
+              IVP_MULPAN16XR16(accSum21, vecData8, vecData7, IVP_EXTRN_2X32(hvecCoeffData21, 3));
+            } /* end of for (ky = 0; ky < kHeightU; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vecNx16 vecOut1Ch, vecOut1L, vecOut1H;
+          xb_vecNx16 vecOut2Ch, vecOut2L, vecOut2H;
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          vecOut1L = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_LO_HALVES);
+          vecOut2L = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_LO_HALVES);
+          vecOut1H = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_HI_HALVES);
+          vecOut2H = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_HI_HALVES);
+          /* Storing the first row , first depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2);
+          IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1);
+          IVP_SAVNX16_XP(vecOut1H, vaOutData, pvecOut, 2 * enable2ndRow * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                               enable2ndRow * outDataPitch1));
+          IVP_SAVNX16_XP(vecOut2H, vaOutData, pvecOut, 2 * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          pOutput += 2 * outDataPitch2;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  else
+  {
+    /* loop across output channels is unrolled twice
+     * to produce two output channels in 1 iteration.
+     * Also loop across output height by 2 , thereby
+     * producing 4 output vectors simultaneously.
+     */
+    for (x = 0; x < outW; x += vectorizationWidth)   /* Loop across Output width */
+    {
+      /* out of bound flag */
+      int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x);
+
+      for (y = 0; y < outH; y += 2)    /* Loop across Output height */
+      {
+        /* In order to handle odd output height */
+        int32_t enable2ndRow = XT_SALT(y, outH - 1);
+        /* initialize output data pointer */
+        int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)];
+
+        /* initialize input data pointer */
+        int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+        /* initialize coeff and bias data pointer*/
+        int16_t *pCoeff = &pCoeffData[0];
+        pdvecBias64 = (xb_vec2Nx8 *) pBiasData64;
+        valign vaBias = IVP_LA2NX8_PP(pdvecBias64);
+
+        for (outCh = 0; outCh < numOutCh; outCh += 2)   /* Loop across Output depth */
+        {
+          /* handles odd output channel */
+          int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+          /* wide vectors(accumulators) initialized with bias */
+          xb_vecNx48 accSum11, accSum21;
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1);
+          ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh);
+
+          /* priming of coeff load is done outside the innermost loop*/
+          phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff);
+          valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1);
+
+          phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh);
+          valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2);
+
+          for (inCh = 0; inCh < numInCh; inCh++)   /* Loop across input channels */
+          {
+            /* variable declarations for input and coeff vectors */
+            xb_vecN_2x32v hvecCoeffData11;
+            xb_vecN_2x32v hvecCoeffData21;
+
+            /* vecInData11 refers to 1st input row, first 32(or lesser) elements
+             * and vecInData12 refers to next few left out elements of the same row
+             * required to compute one 32 way output vector(To compute one 32 way
+             * output vector, we require 32 + edge1 + edge2 number of input elements)
+             */
+            xb_vecNx16 vecData1, vecData2, vecData3, vecData4;
+            xb_vecNx16 vecInData11, vecInData12;
+            xb_vecNx16 vecInData21, vecInData22;
+
+            pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2);
+            pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \
+                                      stride * inDataPitch1 * enable2ndRow);
+
+            for (ky = 0; ky < kHeightU; ky++)   /* Loop across kernel height */
+            {
+              /* loads 1st input row */
+              valign vaInData = IVP_LANX16_PP(pvecIn1);
+              IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* loads Next(5th) input row, corresponding to 2nd output row */
+
+              vaInData = IVP_LANX16_PP(pvecIn2);
+              IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag);
+              IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag));
+
+              /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here
+               * If 1st input row is 0,1,2,3,...63, and the 2nd input row is
+               * 64,65,66,67.........127, Data should be arranged  as
+               *
+               * vecData1 : 0, 4, 8,...56,60,  64,68,72,...120,124
+               * vecData2 : 1, 5, 9,...57,61,  65,69,73,...121,125
+               * vecData3 : 2, 6,10,...58,62,  66,70,74,...122,126
+               * vecData4 : 3, 7,11,...59,63,  67,71,75,...123,127
+               *
+               * Lower half of the vectors contain data from 1st output row and
+               * upper half of the vectors contain data from 2nd output row.
+               */
+
+              IVP_DSELNX16(vecData2, vecData1,
+                           IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0),
+                           IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0),
+                           IVP_SEQ2NX8());
+              IVP_DSELNX16(vecData4, vecData3,
+                           IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2),
+                           IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2),
+                           IVP_SEQ2NX8());
+              /* load 1 row of coeff for 1st output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2);
+
+              /* load 1 row of coeff for 2nd output channel */
+              IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2);
+
+              /* multiplies loaded input data with first two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData11, 0));
+              IVP_MULPAN16XR16(accSum21, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData21, 0));
+              /* multiplies loaded input data with 2nd two coeff */
+              IVP_MULPAN16XR16(accSum11, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData11, 1));
+              IVP_MULPAN16XR16(accSum21, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData21, 1));
+            } /* for (ky = 0; ky < kHeightU; ky++)*/
+          }   /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vecNx16 vecOut1Ch, vecOut1L, vecOut1H;
+          xb_vecNx16 vecOut2Ch, vecOut2L, vecOut2H;
+#if DILATED_VQ_CONV_S16 == VQ_TRUE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \
+                                            pOutScaleData[outCh], outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \
+                                            pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim);
+#elif DILATED_VQ_CONV_S16 == VQ_FALSE
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \
+                                            outScale, outShiftU, minLim, maxLim);
+#endif
+          /* variable store count */
+          varLen = XT_MIN(outW - x, vectorizationWidth);
+
+          vecOut1L = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_LO_HALVES);
+          vecOut2L = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_LO_HALVES);
+          vecOut1H = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_HI_HALVES);
+          vecOut2H = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_HI_HALVES);
+          /* Storing the first row , first depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput);
+          valign vaOutData = IVP_ZALIGN();
+          IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the first row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2);
+          IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 1st depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1);
+          IVP_SAVNX16_XP(vecOut1H, vaOutData, pvecOut, 2 * enable2ndRow * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          /* Storing the 2nd row , 2nd depth output */
+          pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \
+                                               enable2ndRow * outDataPitch1));
+          IVP_SAVNX16_XP(vecOut2H, vaOutData, pvecOut, 2 * \
+                         enable2ndRow * enable2ndCh * varLen);
+          IVP_SAPOSNX16_FP(vaOutData, pvecOut);
+
+          pOutput += 2 * outDataPitch2;
+          pCoeff  += 2 * coeffPitch3;
+        } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/
+      }   /* end of for (y = 0; y < outH; y += 2)*/
+    }     /* end of for (x = 0; x < outW; x += vectorizationWidth)*/
+  }
+  return(XAI_ERROR_STATUS());
+}
+#endif /*if ((XCHAL_VISION_TYPE >= 6))*/
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.c
new file mode 100644
index 00000000000..9915e6649d6
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.c
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+/* NOTE(review): presumably forces the non-VQ build of the header included below; confirm in cnn_dilated_conv_SO.h */
+#undef DILATED_SO_VQ_CONV
+/* 1st pass: with UNSIGNED8BIT the header maps MAKE_NAME / MORPH_* to their U8 (uint8_t) forms */
+#define INPUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_dilated_conv_SO.h"
+#undef INPUT_DATA_TYPE
+/* 2nd pass: same header re-included; with SIGNED8BIT it remaps MAKE_NAME / MORPH_* to the S8 (int8_t) forms */
+#define INPUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_dilated_conv_SO.h"
+#endif /* if ((XCHAL_VISION_TYPE >= 6)) */
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.h
new file mode 100644
index 00000000000..cc937dc2171
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.h
@@ -0,0 +1,1027 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT, suffix)  name ## _ ## MORPH_FNAME_SPECIFIER_IDT ## suffix
+/* MAKE_NAME_IMPL pastes name, "_", the input-type tag and suffix into a single identifier. */
+#if INPUT_DATA_TYPE == UNSIGNED8BIT
+/* U8 input: unsigned scalar/vector types and the ...U load intrinsics (MUL ops presumably unsigned x signed -- confirm in ISA docs). */
+#define MAKE_NAME(name, suffix)  MAKE_NAME_IMPL(name, U8, suffix)
+#define MORPH_IDT_CHECK              XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR             uint8_t
+#define MORPH_IDT_2Nx8               xb_vec2Nx8U
+#define MORPH_OP_PRIME_2Nx8          IVP_LA2NX8U_PP
+#define MORPH_OP_ALIGN_LOAD_2Nx8     IVP_LV2NX8U_XP
+#define MORPH_OP_LOAD_2Nx8           IVP_LA2NX8U_XP
+#define MORPH_OP_LOAD_2Nx8_IP        IVP_LA2NX8U_IP
+#define MORPH_OP_LOAD_2Nx8_VARIABLE  IVP_LAV2NX8U_XP
+#define MORPH_OP_MULA                IVP_MULUSA2N8XR16
+#define MORPH_OP_MULPA               IVP_MULUSPA2NX8
+
+
+#elif INPUT_DATA_TYPE == SIGNED8BIT
+/* S8 input: this branch has no #undefs of its own in the U8 branch, so it clears any earlier definitions before redefining. */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_2Nx8
+#undef MORPH_OP_PRIME_2Nx8
+#undef MORPH_OP_ALIGN_LOAD_2Nx8
+#undef MORPH_OP_LOAD_2Nx8_IP
+#undef MORPH_OP_LOAD_2Nx8_VARIABLE
+#undef MORPH_OP_LOAD_2Nx8
+#undef MORPH_OP_MULA
+#undef MORPH_OP_MULPA
+
+/* Signed counterparts of the table above: same macro names, signed scalar/vector types and intrinsics. */
+#define MAKE_NAME(name, suffix)  MAKE_NAME_IMPL(name, S8, suffix)
+#define MORPH_IDT_CHECK              XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR             int8_t
+#define MORPH_IDT_2Nx8               xb_vec2Nx8
+#define MORPH_OP_PRIME_2Nx8          IVP_LA2NX8_PP
+#define MORPH_OP_ALIGN_LOAD_2Nx8     IVP_LV2NX8_XP
+#define MORPH_OP_LOAD_2Nx8           IVP_LA2NX8_XP
+#define MORPH_OP_LOAD_2Nx8_IP        IVP_LA2NX8_IP
+#define MORPH_OP_LOAD_2Nx8_VARIABLE  IVP_LAV2NX8_XP
+#define MORPH_OP_MULA                IVP_MULA2N8XR16
+#define MORPH_OP_MULPA               IVP_MULPA2NX8
+#endif
+
+/******************************************************************************************
+* SO(Single output) variants
+******************************************************************************************/
+/* convolved3D_S_MxN_S8S8IXCa2_SO_DWH_INPUTNOEDGE                      */
+/* convolved3D_S_MxN_U8S8IXCa2_SO_DWH_INPUTNOEDGE                      */
+/***********************************************************************/
+/* Description : P6 Optimized implementation of 3D convolution in SO   */
+/*               for cases where                                       */
+/*               . there are no edges along depth for input tile       */
+/*                 and coeff tile                                      */
+/*               . dilation = 1                                        */
+/*               . dim2pitch of coeff tile is a multiple of 64         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,         */
+/*               CNN convolution params structure                      */
+/* Outputs     : None (void helper; results are written to outTile)    */
+/* InOuts      : Output Tile                                           */
+/* Assumptions : InData is S8/U8                                       */
+/*               CoeffData is S8                                       */
+/*               OutData is S8 / U8 / S16                              */
+/*               Kernel Size is close to that of Input Size.           */
+/*               Input and Output is in DWH format.                    */
+/*               Coeff is in DWHN format.                              */
+/*               dim1Size of Input Tile is equal to dim1Pitch of Input */
+/*               Tile.                                                 */
+/***********************************************************************/
+#ifdef DILATED_SO_VQ_CONV /* VQ variant: output scales are read from outputScaleArray */
+static _XAI_INLINE_ void MAKE_NAME(convolvedVQ3D_S_MxN, S8IXCa2_SO_DWH_INPUTNOEDGE) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params * param
+  )
+#else /* non-VQ variant: a single output scale is read from param */
+static _XAI_INLINE_ void MAKE_NAME(convolved3D_S_MxN, S8IXCa2_SO_DWH_INPUTNOEDGE) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params * param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM2(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM3(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+#ifdef DILATED_SO_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  const uint8_t outShiftU    = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu   = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag  = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t dilation     = XAI_CNN_CONV_GET_DILATION(param);
+  const uint8_t strideX      = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY      = XAI_CNN_CONV_GET_STRIDEY(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR *pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  /* Pitches of Coefficient Data (DWHN) in dim2 and dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  /* Dilated kernel extent, used only for the edge offsets: odd extents use size / 2, even extents pick size / 2 or size / 2 - 1 via the edge flag */
+  int32_t dilatedKWidthU  = dilation * (kWidthU - 1) + 1;
+  int32_t dilatedKHeightU = dilation * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge;
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch2 + leftEdge * inDataPitch1)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  int32_t outCh, k, x, y, ky;
+
+  MORPH_IDT_2Nx8* restrict pdvecData;
+  MORPH_IDT_2Nx8* restrict pdvecData1;
+  MORPH_IDT_2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+  xb_vec2Nx8* restrict pdvecCoeff3;
+  xb_vec2Nx8* restrict pdvecCoeff4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecBias;
+
+  valign vaOutData = IVP_ZALIGN();
+  if (numOutCh * outW * outH == 1 && kHeightU * kWidthU == 1 && (numInCh & (4 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+  { /* Fast path: single output element, 1x1 kernel, numInCh a multiple of 4 * XCHAL_IVPN_SIMD_WIDTH */
+#ifdef DILATED_SO_VQ_CONV
+    const uint16_t outScale = ((int16_t *) pOutScaleData)[0];
+#endif
+
+    /* Initialize Accumulator */
+    xb_vec2Nx24 daccSum1 = 0;
+
+    /* Input, Output and Coefficient Pointers */
+    int8_t *pOut           = pOutData;
+    MORPH_IDT_SCALAR * pIn = pInData;
+    int8_t *pCoeff1        = pCoeffData;
+
+    pdvecData   = (MORPH_IDT_2Nx8 *) (pIn);
+    pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff1);
+
+    /* Priming Load for Input Data */
+    valign vaData = MORPH_OP_PRIME_2Nx8(pdvecData);
+
+    /* Multiplying and Accumulating 4 * XCHAL_IVPN_SIMD_WIDTH bytes at a time using PMULs */
+    for (k = 0; k < numInCh; k += 4 * XCHAL_IVPN_SIMD_WIDTH)
+    {
+      /* Input Data Load */
+      MORPH_IDT_2Nx8 dvecData1; MORPH_OP_LOAD_2Nx8_IP(dvecData1, vaData, pdvecData);
+      MORPH_IDT_2Nx8 dvecData2; MORPH_OP_LOAD_2Nx8_IP(dvecData2, vaData, pdvecData);
+
+      /* Coefficient Data Load */
+      xb_vec2Nx8 dvecCoeff11; IVP_LV2NX8_IP(dvecCoeff11, pdvecCoeff1, 2 * XCHAL_IVPN_SIMD_WIDTH);
+      xb_vec2Nx8 dvecCoeff12; IVP_LV2NX8_IP(dvecCoeff12, pdvecCoeff1, 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+      /* Pair Multiply and Accumulates */
+      MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11);
+    }
+
+    /* Reduction Addition and Bias Addition */
+    xb_vecN_2x32v hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), \
+                                               IVP_CVT32S2NX24HL(daccSum1));
+    xb_vecN_2x32v hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), \
+                                               IVP_CVT32S2NX24LL(daccSum1));
+    int32_t sum1 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower));
+
+
+    sum1 += pBiasData[0];
+    xb_vecN_2x32v hvecOut = (xb_vecN_2x32v) sum1;
+
+    /* Truncate to 24-bit values */
+    daccSum1 = IVP_CVT24UNX32L(hvecOut, hvecOut);
+
+    xb_vecNx16 outData = IVP_PACKVR2NX24_0(daccSum1, packShiftAccU);
+    xb_vecNx48 m_wvec  = IVP_MULUSNX16((xb_vecNx16U) outScale, outData);
+    outData = IVP_PACKVRNX48(m_wvec, outShiftU);
+    outData = IVP_MAXNX16(IVP_MINNX16(outData, (xb_vecNx16) maxLim), (xb_vecNx16) minLim);
+
+    /* Save the output values */
+    pdvecOut = (xb_vec2Nx8 *) (pOut);
+    IVP_SAV2NX8_XP(IVP_MOV2NX8_FROMNX16(outData), vaOutData, pdvecOut, bytesPerPixel);
+    IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+  }
+  else
+  {
+    /* Output Channels Loop is unrolled by 4 */
+    for (outCh = 0; outCh < numOutCh - 3; outCh += 4) /* Output Channels Loop */
+    {
+#ifdef DILATED_SO_VQ_CONV
+      xb_vecNx16U outScaleData, outScaleDataEven, outScaleDataOdd;
+      valign vascale;
+      //Load output scale values for this group of 4 output channels (8 bytes of 16-bit scales)
+      vascale = IVP_LANX16U_PP(pOutScaleData);
+      IVP_LAVNX16_XP(outScaleData, vascale, pOutScaleData, 8);
+      outScaleDataEven = IVP_SELNX16UI(outScaleData,
+                                       outScaleData,
+                                       IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
+      outScaleDataOdd = IVP_SELNX16UI(outScaleData,
+                                      outScaleData,
+                                      IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1);
+#endif
+      for (y = 0; y < outH; y++) /* Output Height Loop */
+      {
+        for (x = 0; x < outW; x++) /* Output Width Loop */
+        {
+          /* Initialize Accumulator */
+          xb_vec2Nx24 daccSum1 = 0;
+          xb_vec2Nx24 daccSum2 = 0;
+          xb_vec2Nx24 daccSum3 = 0;
+          xb_vec2Nx24 daccSum4 = 0;
+
+          int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+          /* Input and Coefficient Pointers */
+          MORPH_IDT_SCALAR * pIn = (pInData + x * strideX * inDataPitch1 + (y * strideY) * inDataPitch2);
+          int8_t *pCoeff1        = (pCoeffData + outCh * coeffPitch3);
+          int8_t *pCoeff2        = (pCoeffData + (outCh + 1) * coeffPitch3);
+          int8_t *pCoeff3        = (pCoeffData + (outCh + 2) * coeffPitch3);
+          int8_t *pCoeff4        = (pCoeffData + (outCh + 3) * coeffPitch3);
+
+          for (ky = 0; ky < kHeightU; ky++) /* Kernel Height Loop */
+          {
+            pdvecData1  = (MORPH_IDT_2Nx8 *) (pIn);
+            pdvecData2  = (MORPH_IDT_2Nx8 *) (pIn + 2 * XCHAL_IVPN_SIMD_WIDTH);
+            pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff1);
+            pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff2);
+            pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff3);
+            pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff4);
+
+
+            /* Multiplying and Accumulating 4 * XCHAL_IVPN_SIMD_WIDTH bytes at a time using PMULs */
+            for (k = 0; k < kWidthU * numInCh - 4 * XCHAL_IVPN_SIMD_WIDTH; k += 4 * XCHAL_IVPN_SIMD_WIDTH)
+            {
+              /* Input Data Load */
+              valign vaData1 = MORPH_OP_PRIME_2Nx8(pdvecData1);
+              valign vaData2 = MORPH_OP_PRIME_2Nx8(pdvecData2);
+              MORPH_IDT_2Nx8 dvecData1; MORPH_OP_LOAD_2Nx8(dvecData1, vaData1, pdvecData1, 4 * XCHAL_IVPN_SIMD_WIDTH);
+              MORPH_IDT_2Nx8 dvecData2; MORPH_OP_LOAD_2Nx8(dvecData2, vaData2, pdvecData2, 4 * XCHAL_IVPN_SIMD_WIDTH);
+
+              /* Coefficient Data Load */
+              xb_vec2Nx8 dvecCoeff11; IVP_LV2NX8_IP(dvecCoeff11, pdvecCoeff1, 2 * XCHAL_IVPN_SIMD_WIDTH);
+              xb_vec2Nx8 dvecCoeff12; IVP_LV2NX8_IP(dvecCoeff12, pdvecCoeff1, 2 * XCHAL_IVPN_SIMD_WIDTH);
+              xb_vec2Nx8 dvecCoeff21; IVP_LV2NX8_IP(dvecCoeff21, pdvecCoeff2, 2 * XCHAL_IVPN_SIMD_WIDTH);
+              xb_vec2Nx8 dvecCoeff22; IVP_LV2NX8_IP(dvecCoeff22, pdvecCoeff2, 2 * XCHAL_IVPN_SIMD_WIDTH);
+              xb_vec2Nx8 dvecCoeff31; IVP_LV2NX8_IP(dvecCoeff31, pdvecCoeff3, 2 * XCHAL_IVPN_SIMD_WIDTH);
+              xb_vec2Nx8 dvecCoeff32; IVP_LV2NX8_IP(dvecCoeff32, pdvecCoeff3, 2 * XCHAL_IVPN_SIMD_WIDTH);
+              xb_vec2Nx8 dvecCoeff41; IVP_LV2NX8_IP(dvecCoeff41, pdvecCoeff4, 2 * XCHAL_IVPN_SIMD_WIDTH);
+              xb_vec2Nx8 dvecCoeff42; IVP_LV2NX8_IP(dvecCoeff42, pdvecCoeff4, 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+              /* Pair Multiply and Accumulates */
+              MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11);
+              MORPH_OP_MULPA(daccSum2, dvecData2, dvecCoeff22, dvecData1, dvecCoeff21);
+              MORPH_OP_MULPA(daccSum3, dvecData2, dvecCoeff32, dvecData1, dvecCoeff31);
+              MORPH_OP_MULPA(daccSum4, dvecData2, dvecCoeff42, dvecData1, dvecCoeff41);
+            }
+            /* Tail of the kernel row: the last remK = kWidthU * numInCh - k input bytes (kWidthU * numInCh need not be a multiple of 4 * XCHAL_IVPN_SIMD_WIDTH) */
+
+            int32_t remK = kWidthU * numInCh - k;
+            /* remLoad is set to 1 if kWidthU * numInCh - k is greater than 2 * XCHAL_IVPN_SIMD_WIDTH */
+            int32_t remLoad = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, kWidthU * numInCh - k);
+
+            /* Input Data Load: variable-length loads, the byte counts are remK and remK - 2 * XCHAL_IVPN_SIMD_WIDTH */
+            valign vaData1 = MORPH_OP_PRIME_2Nx8(pdvecData1);
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, remK);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData1, pdvecData1, remK - 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+            /* Coefficient Data Load */
+            xb_vec2Nx8 dvecCoeff11; IVP_LV2NX8_XP(dvecCoeff11, pdvecCoeff1, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+            xb_vec2Nx8 dvecCoeff12; IVP_LV2NX8_XP(dvecCoeff12, pdvecCoeff1, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+            xb_vec2Nx8 dvecCoeff21; IVP_LV2NX8_XP(dvecCoeff21, pdvecCoeff2, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+            xb_vec2Nx8 dvecCoeff22; IVP_LV2NX8_XP(dvecCoeff22, pdvecCoeff2, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+            xb_vec2Nx8 dvecCoeff31; IVP_LV2NX8_XP(dvecCoeff31, pdvecCoeff3, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+            xb_vec2Nx8 dvecCoeff32; IVP_LV2NX8_XP(dvecCoeff32, pdvecCoeff3, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+            xb_vec2Nx8 dvecCoeff41; IVP_LV2NX8_XP(dvecCoeff41, pdvecCoeff4, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+            xb_vec2Nx8 dvecCoeff42; IVP_LV2NX8_XP(dvecCoeff42, pdvecCoeff4, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+            /* Pair Multiply and Accumulates */
+            MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11);
+            MORPH_OP_MULPA(daccSum2, dvecData2, dvecCoeff22, dvecData1, dvecCoeff21);
+            MORPH_OP_MULPA(daccSum3, dvecData2, dvecCoeff32, dvecData1, dvecCoeff31);
+            MORPH_OP_MULPA(daccSum4, dvecData2, dvecCoeff42, dvecData1, dvecCoeff41);
+
+            /* Update Pointer*/
+            pIn     += inDataPitch2;
+            pCoeff1 += coeffPitch2;
+            pCoeff2 += coeffPitch2;
+            pCoeff3 += coeffPitch2;
+            pCoeff4 += coeffPitch2;
+          } /* End Kernel Height Loop */
+
+          /* Reduction Addition for outCh (bias is added after the four sums are gathered) */
+          xb_vecN_2x32v hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), \
+                                                     IVP_CVT32S2NX24HL(daccSum1));
+          xb_vecN_2x32v hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), \
+                                                     IVP_CVT32S2NX24LL(daccSum1));
+          int32_t sum1 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower));
+
+          /* Reduction Addition for outCh + 1 */
+          hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum2), IVP_CVT32S2NX24HL(daccSum2));
+          hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum2), IVP_CVT32S2NX24LL(daccSum2));
+          int32_t sum2 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower));
+
+          /* Reduction Addition for outCh + 2 */
+          hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum3), IVP_CVT32S2NX24HL(daccSum3));
+          hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum3), IVP_CVT32S2NX24LL(daccSum3));
+          int32_t sum3 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower));
+
+          /* Reduction Addition for outCh + 3 */
+          hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum4), IVP_CVT32S2NX24HL(daccSum4));
+          hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum4), IVP_CVT32S2NX24LL(daccSum4));
+          int32_t sum4 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower));
+
+          /* Moving all the scalar sums to a 32-bit vector */
+          xb_vecN_2x32v hvecOut = (xb_vecN_2x32v) sum4;
+          hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum3, hvecOut, IVP_LTRN_2I(3));
+          hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum2, hvecOut, IVP_LTRN_2I(2));
+          hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum1, hvecOut, IVP_LTRN_2I(1));
+
+          /* Load bias values corresponding to the four outChannels (16 bytes of int32 bias data) */
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          valign vaBias = IVP_LAN_2X32_PP(phvecBias);
+          xb_vecN_2x32v hvecBias;  IVP_LAVN_2X32_XP(hvecBias, vaBias, phvecBias, 16);
+          hvecOut = IVP_ADDN_2X32(hvecOut, hvecBias);
+
+          /* Truncate to 24-bit values */
+          daccSum1 = IVP_CVT24UNX32L(hvecOut, hvecOut);
+
+          /* Pack, Scale, Shift and Clamp the accumulator output */
+          xb_vec2Nx8 dvecOutData0L, dvecOutData0H;
+#ifdef DILATED_SO_VQ_CONV
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Save the output values */
+          pdvecOut = (xb_vec2Nx8 *) (pOut);
+          IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, 4 * bytesPerPixel);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        } /* End Output Width Loop */
+      }   /* End Output Height Loop */
+    }     /* End Output Channels Loop */
+
+    /* Corner case handling if Number of Output Channels is not a multiple of 4 */
+    if (outCh < numOutCh)
+    {
+#ifdef DILATED_SO_VQ_CONV
+      xb_vecNx16U outScaleData, outScaleDataEven, outScaleDataOdd;
+      valign vascale;
+      //Load output scale values. NOTE(review): 6 bytes (3 scales) are requested even when remOutCh < 3 -- confirm outputScaleArray is readable that far
+      vascale = IVP_LANX16U_PP(pOutScaleData);
+      IVP_LAVNX16_XP(outScaleData, vascale, pOutScaleData, 6);
+      outScaleDataEven = IVP_SELNX16UI(outScaleData,
+                                       outScaleData,
+                                       IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
+      outScaleDataOdd = IVP_SELNX16UI(outScaleData,
+                                      outScaleData,
+                                      IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1);
+#endif
+
+      int32_t remOutCh = numOutCh - outCh;
+      for (y = 0; y < outH; y++)
+      {
+        for (x = 0; x < outW; x++)
+        {
+          /* Initialize Accumulator */
+          xb_vec2Nx24 daccSum1 = 0;
+          xb_vec2Nx24 daccSum2 = 0;
+          xb_vec2Nx24 daccSum3 = 0;
+
+          /* Input, Output and Coefficient Pointers; pCoeff2 / pCoeff3 are clamped to the last remaining outCh when remOutCh < 3 */
+          int8_t *pOut           = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+          MORPH_IDT_SCALAR * pIn = (pInData + x * strideX * inDataPitch1 + \
+                                    (y * strideY) * inDataPitch2);
+          int8_t *pCoeff1 = (pCoeffData + outCh * coeffPitch3);
+          int8_t *pCoeff2 = (pCoeffData + (outCh + XT_MIN(1, remOutCh - 1)) * coeffPitch3);
+          int8_t *pCoeff3 = (pCoeffData + (outCh + XT_MIN(2, remOutCh - 1)) * coeffPitch3);
+
+          for (ky = 0; ky < kHeightU; ky++) /* Kernel Height Loop */
+          {
+            pdvecData   = (MORPH_IDT_2Nx8 *) (pIn);
+            pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff1);
+            pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff2);
+            pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff3);
+
+            /* Priming Load for Input Data */
+            valign vaData = MORPH_OP_PRIME_2Nx8(pdvecData);
+
+            /* Multiplying and Accumulating 128 bytes at a time using PMULs */
+            for (k = 0; k < kWidthU * numInCh - 4 * XCHAL_IVPN_SIMD_WIDTH; k += 4 * XCHAL_IVPN_SIMD_WIDTH)
+            {
+              /* Input Data Load */
+              MORPH_IDT_2Nx8 dvecData1; MORPH_OP_LOAD_2Nx8_IP(dvecData1, vaData, pdvecData);
+              MORPH_IDT_2Nx8 dvecData2; MORPH_OP_LOAD_2Nx8_IP(dvecData2, vaData, pdvecData);
+
+              /* Coefficient Data Load */
+              xb_vec2Nx8 dvecCoeff11; IVP_LV2NX8_IP(dvecCoeff11, pdvecCoeff1, 2 * XCHAL_IVPN_SIMD_WIDTH);
+              xb_vec2Nx8 dvecCoeff12; IVP_LV2NX8_IP(dvecCoeff12, pdvecCoeff1, 2 * XCHAL_IVPN_SIMD_WIDTH);
+              xb_vec2Nx8 dvecCoeff21; IVP_LV2NX8_IP(dvecCoeff21, pdvecCoeff2, 2 * XCHAL_IVPN_SIMD_WIDTH);
+              xb_vec2Nx8 dvecCoeff22; IVP_LV2NX8_IP(dvecCoeff22, pdvecCoeff2, 2 * XCHAL_IVPN_SIMD_WIDTH);
+              xb_vec2Nx8 dvecCoeff31; IVP_LV2NX8_IP(dvecCoeff31, pdvecCoeff3, 2 * XCHAL_IVPN_SIMD_WIDTH);
+              xb_vec2Nx8 dvecCoeff32; IVP_LV2NX8_IP(dvecCoeff32, pdvecCoeff3, 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+              /* Pair Multiply and Accumulates */
+              MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11);
+              MORPH_OP_MULPA(daccSum2, dvecData2, dvecCoeff22, dvecData1, dvecCoeff21);
+              MORPH_OP_MULPA(daccSum3, dvecData2, dvecCoeff32, dvecData1, dvecCoeff31);
+            }
+            int32_t remK = kWidthU * numInCh - k;
+            /* remLoad is set to 1 if kWidthU * numInCh - k is greater than 2 * XCHAL_IVPN_SIMD_WIDTH */
+            int32_t remLoad = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, kWidthU * numInCh - k);
+
+            /* Input Data Load: variable-length loads, the byte counts are remK and remK - 2 * XCHAL_IVPN_SIMD_WIDTH */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData, pdvecData, remK);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData, pdvecData, remK - 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+            /* Coefficient Data Load */
+            xb_vec2Nx8 dvecCoeff11; IVP_LV2NX8_XP(dvecCoeff11, pdvecCoeff1, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+            xb_vec2Nx8 dvecCoeff12; IVP_LV2NX8_XP(dvecCoeff12, pdvecCoeff1, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+            xb_vec2Nx8 dvecCoeff21; IVP_LV2NX8_XP(dvecCoeff21, pdvecCoeff2, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+            xb_vec2Nx8 dvecCoeff22; IVP_LV2NX8_XP(dvecCoeff22, pdvecCoeff2, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+            xb_vec2Nx8 dvecCoeff31; IVP_LV2NX8_XP(dvecCoeff31, pdvecCoeff3, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+            xb_vec2Nx8 dvecCoeff32; IVP_LV2NX8_XP(dvecCoeff32, pdvecCoeff3, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+            /* Pair Multiply and Accumulates */
+            MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11);
+            MORPH_OP_MULPA(daccSum2, dvecData2, dvecCoeff22, dvecData1, dvecCoeff21);
+            MORPH_OP_MULPA(daccSum3, dvecData2, dvecCoeff32, dvecData1, dvecCoeff31);
+
+            /* Update Pointer*/
+            pIn     += inDataPitch2;
+            pCoeff1 += coeffPitch2;
+            pCoeff2 += coeffPitch2;
+            pCoeff3 += coeffPitch2;
+          } /* End Kernel Height Loop */
+          /* Reduction Addition (bias is added after the sums are gathered) */
+          xb_vecN_2x32v hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), \
+                                                     IVP_CVT32S2NX24HL(daccSum1));
+          xb_vecN_2x32v hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), \
+                                                     IVP_CVT32S2NX24LL(daccSum1));
+          int32_t sum1 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower));
+
+          /* Reduction Addition */
+          hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum2), IVP_CVT32S2NX24HL(daccSum2));
+          hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum2), IVP_CVT32S2NX24LL(daccSum2));
+          int32_t sum2 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower));
+
+          /* Reduction Addition */
+          hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum3), IVP_CVT32S2NX24HL(daccSum3));
+          hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum3), IVP_CVT32S2NX24LL(daccSum3));
+          int32_t sum3 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower));
+
+          /* Moving all the scalar sums to a 32-bit vector */
+          xb_vecN_2x32v hvecOut = (xb_vecN_2x32v) sum3;
+          hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum2, hvecOut, IVP_LTRN_2I(2));
+          hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum1, hvecOut, IVP_LTRN_2I(1));
+
+          /* Load bias values corresponding to the remaining outChannels (4 * remOutCh bytes of int32 bias data) */
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          valign vaBias = IVP_LAN_2X32_PP(phvecBias);
+          xb_vecN_2x32v hvecBias;  IVP_LAVN_2X32_XP(hvecBias, vaBias, phvecBias, 4 * remOutCh);
+
+          /* Add bias to the accumulated value*/
+          hvecOut = IVP_ADDN_2X32(hvecOut, hvecBias);
+
+          /* Truncate to 24-bit values */
+          daccSum1 = IVP_CVT24UNX32L(hvecOut, hvecOut);
+
+          /* Pack, Scale, Shift and Clamp the accumulator output */
+          xb_vec2Nx8 dvecOutData0L, dvecOutData0H;
+#ifdef DILATED_SO_VQ_CONV
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Save the output values */
+          pdvecOut = (xb_vec2Nx8 *) (pOut);
+          IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, remOutCh * bytesPerPixel);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        } /* End Output Width Loop */
+      }   /* End Output Height Loop */
+    }     /* End of if (outCh < numOutCh) */
+  }       /*End else*/
+}
+
+
+/***************************************************************************/
+/*  xaiConvolved(VQ)3D_S_MxN_S8_SO_DWH/xaiConvolved(VQ)3D_S_MxN_U8_SO_DWH    */
+/***************************************************************************/
+
+/***********************************************************************/
+/* Description : P6 Optimized implementation of 3D convolution in SO   */
+/*               Vectorization Approach.                               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,         */
+/*               CNN convolution params structure                      */
+/* Outputs     : XI Error Code                                         */
+/* InOuts      : Output Tile                                           */
+/* Assumptions : InData is S8/U8                                       */
+/*               CoeffData is S8                                       */
+/*               OutData is S8 / U8 / S16                              */
+/*               Kernel Size is close to that of Input Size.           */
+/*               Input and Output is in DWH format.                    */
+/*               Coeff is in DWHN format.                              */
+/***********************************************************************/
+
+/***************** xaiConvolvedVQ3D_S_MxN_S8S8IX_SO_DWH *****************/
+/***************** xaiConvolvedVQ3D_S_MxN_U8S8IX_SO_DWH *****************/
+/****************** xaiConvolved3D_S_MxN_S8S8IX_SO_DWH ******************/
+/****************** xaiConvolved3D_S_MxN_U8S8IX_SO_DWH ******************/
+
+#ifdef DILATED_SO_VQ_CONV /* VQ build: output scales are read from outputScaleArray (one 16-bit scale per kernel) */
+XAI_ERR_TYPE MAKE_NAME(xaiConvolvedVQ3D_S_MxN, S8IX_SO_DWH) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params * param
+  )
+#else /* non-VQ build: a single output scale is read from param */
+XAI_ERR_TYPE MAKE_NAME(xaiConvolved3D_S_MxN, S8IX_SO_DWH) (
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params * param
+  )
+#endif /* DILATED_SO_VQ_CONV */
+{
+  /* Error Checks: argument validation (tile types and placement, overlap, stride, dilation, data order, shifts) */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) &&                                      \
+                    ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG,                    \
+                    "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) ||                                                            \
+                    ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) &&                                                           \
+                     (XAI_CNN_CONV_GET_STRIDEX(param) == 1)), XAI_ERR_BADARG,                                              \
+                    "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when strideX is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONX(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) ||                                                            \
+                    ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) &&                                                           \
+                     (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG,                                              \
+                    "\nDilationY = %hhu\nDilationY should be 1. It can be greater than 1 only when strideY is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_EDGES_SO(inTile, coeffTile, param);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_DWHN);
+    XAI_CHECK_CONSISTENCY_SO_DWH(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_SO_VQ_CONV
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+#endif /* DILATED_SO_VQ_CONV */
+  }
+#ifndef DILATED_SO_VQ_CONV /* non-VQ build only: shortcut when the scalar output scale is 0 */
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0)
+  {
+    int32_t fillValue; /* value handed to xaiFillTile3D for the output tile */
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif /* !DILATED_SO_VQ_CONV */
+  /* Fast path: when
+   * 1) there are no edges along depth (dim1 pitch == dim1) for input and coeff, and dilationX == dilationY == 1
+   * 2) the coeff pointer is aligned to (XCHAL_IVPN_SIMD_WIDTH << 1) and dim2pitch is a multiple of (XCHAL_IVPN_SIMD_WIDTH << 1)
+   * delegate to MAKE_NAME(convolved3D_S_MxN, S8IXCa2_SO_DWH_INPUTNOEDGE) (convolvedVQ3D_S_MxN in the VQ build) and return.
+   */
+  if ((XAI_TILE3D_GET_DIM1_PITCH(inTile) == XAI_TILE3D_GET_DIM1(inTile)) &&
+      (XAI_TILE4D_GET_DIM1_PITCH(coeffTile) == XAI_TILE4D_GET_DIM1(coeffTile)) && \
+      (XAI_CNN_CONV_GET_DILATIONX(param) == 1) && (XAI_CNN_CONV_GET_DILATIONY(param) == 1))
+  {
+    if ((XAI_TILE4D_IS_PTR_ALIGNED_2NX8(coeffTile) && \
+         (XAI_TILE4D_GET_DIM2_PITCH(coeffTile) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0))
+    {
+#ifdef DILATED_SO_VQ_CONV
+      MAKE_NAME(convolvedVQ3D_S_MxN, S8IXCa2_SO_DWH_INPUTNOEDGE) (inTile,
+                                                                  coeffTile,
+                                                                  biasArray,
+                                                                  outputScaleArray,
+                                                                  outTile,
+                                                                  param);
+#else
+      MAKE_NAME(convolved3D_S_MxN, S8IXCa2_SO_DWH_INPUTNOEDGE) (inTile,
+                                                                coeffTile,
+                                                                biasArray,
+                                                                outTile,
+                                                                param);
+#endif
+      return(XAI_ERROR_STATUS());
+    }
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM2(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM3(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+#ifdef DILATED_SO_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  const uint8_t outShiftU    = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu   = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag  = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t dilationX    = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY    = XAI_CNN_CONV_GET_DILATIONY(param);
+  const uint8_t strideX      = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY      = XAI_CNN_CONV_GET_STRIDEY(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR *pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData          = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData        = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData        = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  /* Pitches of Coefficient Data (DWHN) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t dilatedKWidthU  = dilationX * (kWidthU - 1) + 1; /* kernel width after applying dilationX */
+  int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1; /* kernel height after applying dilationY */
+  int32_t leftEdge, topEdge; /* how far pInData is moved back along dim2 / dim3 below */
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else /* even dilated width: leftEdgeFlag selects width/2 or width/2 - 1 */
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else /* even dilated height: topEdgeFlag selects height/2 or height/2 - 1 */
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the active data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch2 + leftEdge * inDataPitch1)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 when the output tile is S16 */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  int32_t outCh, inCh, x, y, ky, kx;
+
+  MORPH_IDT_2Nx8* restrict pdvecData;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Output Channels Loop is unrolled by 2: output channels outCh and outCh + 1 are computed per iteration */
+  for (outCh = 0; outCh < numOutCh - 1; outCh += 2) /* Output Channels Loop */
+  {
+#ifdef DILATED_SO_VQ_CONV
+    xb_vecNx16U outScaleData, outScaleDataEven, outScaleDataOdd;
+    valign vascale;
+    // Load the output scale values for this pair of output channels (variable-length load of 4 bytes)
+    vascale = IVP_LANX16U_PP(pOutScaleData);
+    IVP_LAVNX16_XP(outScaleData, vascale, pOutScaleData, 4);
+    outScaleDataEven = IVP_SELNX16UI(outScaleData,
+                                     outScaleData,
+                                     IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
+    outScaleDataOdd = IVP_SELNX16UI(outScaleData,
+                                    outScaleData,
+                                    IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1);
+#endif
+
+    for (y = 0; y < outH; y++) /* Output Height Loop */
+    {
+      for (x = 0; x < outW; x++) /* Output Width Loop */
+      {
+        /* Initialize Accumulators (one per output channel of the pair) */
+        xb_vec2Nx24 daccSum1 = 0;
+        xb_vec2Nx24 daccSum2 = 0;
+
+        int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height Loop */
+        {
+          /* Input and Coefficient Pointers */
+          MORPH_IDT_SCALAR * pIn = (pInData + x * strideX * inDataPitch1 + \
+                                    (y * strideY + ky * dilationY) * inDataPitch2);
+          int8_t *pCoeff1 = (pCoeffData + outCh * coeffPitch3 + ky * coeffPitch2);
+          int8_t *pCoeff2 = (pCoeffData + (outCh + 1) * coeffPitch3 + ky * coeffPitch2);
+
+          for (kx = 0; kx < kWidthU; kx++) /* Kernel Width Loop */
+          {
+            pdvecData   = (MORPH_IDT_2Nx8 *) (pIn);
+            pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff1);
+            pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff2);
+
+            /* Priming Loads for Input and Coefficient Data */
+            valign vaData   = MORPH_OP_PRIME_2Nx8(pdvecData);
+            valign vaCoeff1 = IVP_LA2NX8_PP(pdvecCoeff1);
+            valign vaCoeff2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+            /* Multiplying and Accumulating 4 * XCHAL_IVPN_SIMD_WIDTH bytes at a time using PMULs */
+            for (inCh = 0; inCh < numInCh - 4 * XCHAL_IVPN_SIMD_WIDTH; inCh += 4 * XCHAL_IVPN_SIMD_WIDTH)
+            {
+              /* Input Data Load */
+              MORPH_IDT_2Nx8 dvecData1; MORPH_OP_LOAD_2Nx8_IP(dvecData1, vaData, pdvecData);
+              MORPH_IDT_2Nx8 dvecData2; MORPH_OP_LOAD_2Nx8_IP(dvecData2, vaData, pdvecData);
+
+              /* Coefficient Data Load */
+              xb_vec2Nx8 dvecCoeff11; IVP_LA2NX8_IP(dvecCoeff11, vaCoeff1, pdvecCoeff1);
+              xb_vec2Nx8 dvecCoeff12; IVP_LA2NX8_IP(dvecCoeff12, vaCoeff1, pdvecCoeff1);
+              xb_vec2Nx8 dvecCoeff21; IVP_LA2NX8_IP(dvecCoeff21, vaCoeff2, pdvecCoeff2);
+              xb_vec2Nx8 dvecCoeff22; IVP_LA2NX8_IP(dvecCoeff22, vaCoeff2, pdvecCoeff2);
+
+              /* Pair Multiply and Accumulates */
+              MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11);
+              MORPH_OP_MULPA(daccSum2, dvecData2, dvecCoeff22, dvecData1, dvecCoeff21);
+            }
+            /* Remainder handling (always executed): the last remLength input channels, loaded with variable-length loads */
+            int32_t remLength = numInCh - inCh;
+
+            /* Input Data Load */
+            MORPH_IDT_2Nx8 dvecData1;
+            MORPH_OP_LOAD_2Nx8_VARIABLE(dvecData1, vaData, pdvecData, remLength);
+            MORPH_IDT_2Nx8 dvecData2;
+            MORPH_OP_LOAD_2Nx8_VARIABLE(dvecData2, vaData, pdvecData, \
+                                        remLength - 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+            /* Coefficient Data Load */
+            xb_vec2Nx8 dvecCoeff11, dvecCoeff12, dvecCoeff21, dvecCoeff22;
+            IVP_LAV2NX8_XP(dvecCoeff11, vaCoeff1, pdvecCoeff1, remLength);
+            IVP_LAV2NX8_XP(dvecCoeff12, vaCoeff1, pdvecCoeff1, remLength - 2 * XCHAL_IVPN_SIMD_WIDTH);
+            IVP_LAV2NX8_XP(dvecCoeff21, vaCoeff2, pdvecCoeff2, remLength);
+            IVP_LAV2NX8_XP(dvecCoeff22, vaCoeff2, pdvecCoeff2, remLength - 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+            MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11);
+            MORPH_OP_MULPA(daccSum2, dvecData2, dvecCoeff22, dvecData1, dvecCoeff21);
+
+            pIn     += dilationX * inDataPitch1;
+            pCoeff1 += coeffPitch1;
+            pCoeff2 += coeffPitch1;
+          } /* End Kernel Width Loop */
+        }   /* End Kernel Height Loop */
+
+        /* Reduction Addition and Bias Addition for output channel outCh */
+        xb_vecN_2x32v hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), \
+                                                   IVP_CVT32S2NX24HL(daccSum1));
+        xb_vecN_2x32v hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), \
+                                                   IVP_CVT32S2NX24LL(daccSum1));
+        int32_t sum1 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower));
+        sum1 += pBiasData[outCh];
+
+        /* Reduction Addition and Bias Addition for output channel outCh + 1 */
+        hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum2), IVP_CVT32S2NX24HL(daccSum2));
+        hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum2), IVP_CVT32S2NX24LL(daccSum2));
+        int32_t sum2 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower));
+        sum2 += pBiasData[outCh + 1];
+
+        /* Moving all the scalar sums to a 32-bit vector */
+        xb_vecN_2x32v hvecOut = 0;
+        hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum2, hvecOut, IVP_LTRN_2I(2));
+        hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum1, hvecOut, IVP_LTRN_2I(1));
+
+        /* Truncate to 24-bit values */
+        daccSum1 = IVP_CVT24UNX32L(hvecOut, hvecOut);
+
+        /* Pack, Scale, Shift and Clamp the accumulator output */
+        xb_vec2Nx8 dvecOutData0L, dvecOutData0H;
+#ifdef DILATED_SO_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Save the output values of both channels (2 * bytesPerPixel bytes) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut);
+        IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, 2 * bytesPerPixel);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End Output Width Loop */
+    }   /* End Output Height Loop */
+  }     /* End Output Channels Loop */
+
+  /* Corner case handling if Number of Output Channels is odd: the last channel is processed on its own */
+  if (outCh < numOutCh)
+  {
+#ifdef DILATED_SO_VQ_CONV
+    xb_vecNx16U outScaleData, outScaleDataEven, outScaleDataOdd;
+    valign vascale;
+    // Load the output scale value for the last output channel (variable-length load of 2 bytes)
+    vascale = IVP_LANX16U_PP(pOutScaleData);
+    IVP_LAVNX16_XP(outScaleData, vascale, pOutScaleData, 2);
+    outScaleDataEven = IVP_SELNX16UI(outScaleData,
+                                     outScaleData,
+                                     IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
+    outScaleDataOdd = IVP_SELNX16UI(outScaleData,
+                                    outScaleData,
+                                    IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1);
+#endif
+    for (y = 0; y < outH; y++) /* Output Height Loop */
+    {
+      for (x = 0; x < outW; x++) /* Output Width Loop */
+      {
+        /* Initialize Accumulator */
+        xb_vec2Nx24 daccSum1 = 0;
+
+        int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height Loop */
+        {
+          /* Input and Coefficient Pointers */
+          MORPH_IDT_SCALAR * pIn = (pInData + x * strideX * inDataPitch1 + \
+                                    (y * strideY + ky * dilationY) * inDataPitch2);
+          int8_t *pCoeff1 = (pCoeffData + outCh * coeffPitch3 + ky * coeffPitch2);
+
+          for (kx = 0; kx < kWidthU; kx++) /* Kernel Width Loop */
+          {
+            pdvecData   = (MORPH_IDT_2Nx8 *) (pIn);
+            pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff1);
+
+            /* Priming Loads for Input and Coefficient Data */
+            valign vaData   = MORPH_OP_PRIME_2Nx8(pdvecData);
+            valign vaCoeff1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+            /* Multiplying and Accumulating 4 * XCHAL_IVPN_SIMD_WIDTH bytes at a time using PMULs */
+            for (inCh = 0; inCh < numInCh - 4 * XCHAL_IVPN_SIMD_WIDTH; inCh += 4 * XCHAL_IVPN_SIMD_WIDTH)
+            {
+              /* Input Data Load */
+              MORPH_IDT_2Nx8 dvecData1; MORPH_OP_LOAD_2Nx8_IP(dvecData1, vaData, pdvecData);
+              MORPH_IDT_2Nx8 dvecData2; MORPH_OP_LOAD_2Nx8_IP(dvecData2, vaData, pdvecData);
+
+              /* Coefficient Data Load */
+              xb_vec2Nx8 dvecCoeff11; IVP_LA2NX8_IP(dvecCoeff11, vaCoeff1, pdvecCoeff1);
+              xb_vec2Nx8 dvecCoeff12; IVP_LA2NX8_IP(dvecCoeff12, vaCoeff1, pdvecCoeff1);
+
+              /* Pair Multiply and Accumulates */
+              MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11);
+            }
+            /* Remainder handling (always executed): the last remLength input channels, loaded with variable-length loads */
+            int32_t remLength = numInCh - inCh;
+
+            /* Input Data Load */
+            MORPH_IDT_2Nx8 dvecData1;
+            MORPH_OP_LOAD_2Nx8_VARIABLE(dvecData1, vaData, pdvecData, remLength);
+            MORPH_IDT_2Nx8 dvecData2;
+            MORPH_OP_LOAD_2Nx8_VARIABLE(dvecData2, vaData, pdvecData, \
+                                        remLength - 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+            /* Coefficient Data Load */
+            xb_vec2Nx8 dvecCoeff11, dvecCoeff12;
+            IVP_LAV2NX8_XP(dvecCoeff11, vaCoeff1, pdvecCoeff1, remLength);
+            IVP_LAV2NX8_XP(dvecCoeff12, vaCoeff1, pdvecCoeff1, remLength - 2 * XCHAL_IVPN_SIMD_WIDTH);
+
+            MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11);
+
+            pIn     += dilationX * inDataPitch1;
+            pCoeff1 += coeffPitch1;
+          } /* End Kernel Width Loop */
+        }   /* End Kernel Height Loop */
+        /* Reduction Addition and Bias Addition */
+        xb_vecN_2x32v hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), \
+                                                   IVP_CVT32S2NX24HL(daccSum1));
+        xb_vecN_2x32v hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), \
+                                                   IVP_CVT32S2NX24LL(daccSum1));
+        int32_t sum1 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower));
+        sum1 += pBiasData[outCh];
+
+        /* Moving the scalar sum to a 32-bit vector */
+        xb_vecN_2x32v hvecOut = (xb_vecN_2x32v) sum1;
+
+        /* Truncate to 24-bit values */
+        daccSum1 = IVP_CVT24UNX32L(hvecOut, hvecOut);
+
+        /* Pack, Scale, Shift and Clamp the accumulator output */
+        xb_vec2Nx8 dvecOutData0L, dvecOutData0H;
+#ifdef DILATED_SO_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Save the single output value (bytesPerPixel bytes) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut);
+        IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, bytesPerPixel);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End Output Width Loop */
+    }   /* End Output Height Loop */
+  }     /* End of if (outCh < numOutCh) */
+
+  return(XAI_ERROR_STATUS());
+}
+
+
+/****************************** end of SO variants *****************************************/
+/*******************************************************************************************/
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD.c
new file mode 100644
index 00000000000..26cdd5dbddf
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD.c
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define DILATED_VQ_CONV
+#include "cnn_dilated_conv_MOD.h"
+
+/******************************* end of MOD variants ***************************************/
+/*******************************************************************************************/
+#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/
+
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD_S16.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD_S16.c
new file mode 100644
index 00000000000..5a8e430f81f
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD_S16.c
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define DILATED_VQ_CONV_S16
+
+#include "cnn_dilated_conv_MOD_S16.h"
+/******************************* end of MOD variants ***************************************/
+/*******************************************************************************************/
+#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/
+
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW.c
new file mode 100644
index 00000000000..35a3c00cc59
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define DILATED_VQ_CONV  VQ_TRUE
+
+#define INPUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_dilated_conv_MOW.h"
+#undef INPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_dilated_conv_MOW.h"
+
+#undef INPUT_DATA_TYPE
+#endif //if ((XCHAL_VISION_TYPE >= 6))
+
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW_S16.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW_S16.c
new file mode 100644
index 00000000000..3559031cb6e
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW_S16.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define DILATED_VQ_CONV_S16  VQ_TRUE
+
+#include "cnn_dilated_conv_MOW_S16.h"
+#endif //if ((XCHAL_VISION_TYPE >= 6))
+
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_SO.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_SO.c
new file mode 100644
index 00000000000..6944b553462
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_SO.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+#define DILATED_SO_VQ_CONV
+
+#define INPUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_dilated_conv_SO.h"
+#undef INPUT_DATA_TYPE
+
+#define INPUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_dilated_conv_SO.h"
+
+#undef INPUT_DATA_TYPE
+#endif //if ((XCHAL_VISION_TYPE >= 6))
+
+
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD.c
new file mode 100644
index 00000000000..58390cbc189
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define  DILATED_VQ_CONV_PARTIAL
+#include "cnn_dilated_conv_partial_MOD.h"
+#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD_S16.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD_S16.c
new file mode 100644
index 00000000000..de61d850b56
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD_S16.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define DILATED_VQ_CONV_PARTIAL
+#include "cnn_dilated_conv_partial_MOD_S16.h"
+#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.c
new file mode 100644
index 00000000000..3031ade2233
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#undef  DILATED_VQ_CONV_PARTIAL
+#include "cnn_dilated_conv_partial_MOD.h"
+#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.h
new file mode 100644
index 00000000000..fed76eb2064
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.h
@@ -0,0 +1,7858 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution.     */
+/*               If DILATED_VQ_CONV_PARTIAL is defined, output scales come  */
+/*               from outputScaleArray; otherwise from the conv params.     */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array, Output Scale */
+/*               Array (VQ build only), CNN convolution params structure    */
+/* InOuts      : Accumulator Tile, Output Tile                              */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk; M and N are both <= 16.           */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Depth-dimension edges of inTile and coeffTile are zero.    */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Tile geometry (DWH): dim1 = channels (depth), dim2 = width, dim3 = height */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel width and height: dim3 and dim4 of the NDWH coefficient tile */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t dilationX     = 1; /* fixed at 1 in this variant; not read from param */
+  const uint8_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL; /* stays NULL when both the input and output flags are set; accTile is not touched in that case */
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 (the dim2 pitch is not needed here) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2; both stay 0 when accTile is not used */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t numIter = kWidthU * numInCh; /* combined (kernel width * input channels) count; the inner loop steps through it 4 at a time */
+
+  int32_t dilatedKWidthU  = dilationX * (kWidthU - 1) + 1; /* kernel extent once dilation is applied */
+  int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge; /* how far pInData is moved back along dim1-pitch / dim2-pitch below */
+  if ((dilatedKWidthU % 2) != 0) /* odd extent: edge is half the extent, rounded down */
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else /* even extent: the left-edge flag picks extent/2 or extent/2 - 1 */
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0) /* odd extent: edge is half the extent, rounded down */
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else /* even extent: the top-edge flag picks extent/2 or extent/2 - 1 */
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Output clamp limits: the ReLU min/max from param when ReLU is enabled; otherwise the S16 range, the S8 range, or [0, UCHAR_MAX] for any other outTile type */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 when outTile is S16, else 0 */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); /* scales output pointer offsets and store lengths */
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN(); /* alignment register shared by the output-tile and accumulator-tile stores */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data are zero and also
+   * edges along depth dimension of coefficient data are zero.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH */
+    int32_t remainingOutCh = numOutCh - outCh; /* channels left from outCh onward; feeds the length arguments of the loads and stores below */
+#ifdef DILATED_VQ_CONV_PARTIAL
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /* Load output scale values (even / odd vectors) via VQ_INIT_OUTSCALE */
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Odd-height corner case: numY is 0 when y is the last row (no row y + 1), otherwise 1 */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* Odd-width corner case: numX is 0 when x is the last column (no column x + 1), otherwise 1 */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output and accumulator pointers for pixel (x, y), the first pixel of the 2x2 output patch */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize the four accumulators, one per pixel of the 2x2 output patch */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        if (inputFlag) /* inputFlag set: start from the bias values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+        }
+        else  /* inputFlag clear: start from the values held in the accumulator tile */
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); /* pixel (x, y) */
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); /* length arguments step down by 2 * XCHAL_IVPN_SIMD_WIDTH per load */
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL);
+          IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX); /* pixel (x + numX, y) */
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL);
+          IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY); /* pixel (x, y + numY) */
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL);
+          IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY); /* pixel (x + numX, y + numY) */
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL);
+          IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL);
+        }
+
+        /* Input Data and Coeff Data Pointers (input offset is the output position scaled by the strides) */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Input pointers for the four pixels of the 2x2 patch at kernel row ky (row offset scaled by dilationY) */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); /* offset is 0 (same address as pdvecData1) unless numX and numY are both 1 */
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels; each load is issued with a length argument of 4 */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients, with coeffPitch1 as the pointer update between loads */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); /* the same four coefficient vectors are used for all four pixels; only the packed input scalar differs */
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End (Input Channels * kWidth) */
+        } /* End Kernel Height */
+
+        if (outputFlag)  /* Store to output Tile */
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 (pixel (x, y)) along the output depth; the H half gets a non-zero length only when typeFlag is 1 (S16) */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 (pixel (x + 1, y)) along the output depth; offset and lengths are scaled by numX, so both are 0 when that column does not exist */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 (pixel (x, y + 1)) along the output depth; offset and lengths are scaled by numY, so both are 0 when that row does not exist */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 (pixel (x + numX, y + numY)) along the output depth; lengths are scaled by numX * numY, so they are 0 unless both neighbours exist */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         remainingOutCh * numX * numY);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* outputFlag clear: write the accumulators back to the accumulator tile as 32-bit vectors */
+        {
+          xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1);
+          xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1);
+          xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1);
+          xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1);
+
+          xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2);
+          xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2);
+          xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2);
+          xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2);
+
+          xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3);
+          xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3);
+          xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3);
+          xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3);
+
+          xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4);
+          xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4);
+          xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4);
+          xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+
+
+          /* Store the hvecAcc1 (pixel (x, y)) along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 (pixel (x + 1, y)) along the accTile depth; numX = 0 makes every length argument zero or negative */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX);
+          IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX);
+          IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 (pixel (x, y + 1)) along the accTile depth; numY = 0 makes every length argument zero or negative */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY);
+          IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY);
+          IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc4 (pixel (x + numX, y + numY)) along the accTile depth; lengths are scaled by numX * numY */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY));
+          IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY);
+          IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t dilationX     = 1;
+  const uint8_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2 */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t numIter = kWidthU * numInCh;
+
+  int32_t dilatedKWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge;
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable to handle corner case when height is odd */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* Variable to handle corner case when width is odd */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize accumulators */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+        }
+        else  /* Accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL);
+          IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL);
+          IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL);
+          IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL);
+          IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL);
+        }
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, 4);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0);
+#else
+            xb_vec2Nx8 dvecData1;
+            xb_vec2Nx8 dvecData2;
+            xb_vec2Nx8 dvecData3;
+            xb_vec2Nx8 dvecData4;
+
+            dvecData1 = IVP_SUB2NX8U(dvecInp1, 128);
+            dvecData2 = IVP_SUB2NX8U(dvecInp2, 128);
+            dvecData3 = IVP_SUB2NX8U(dvecInp3, 128);
+            dvecData4 = IVP_SUB2NX8U(dvecInp4, 128);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+#endif
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#endif
+          }   /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        if (outputFlag)  /* Store to output Tile*/
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         remainingOutCh * numX * numY);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1);
+          xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1);
+          xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1);
+          xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1);
+
+          xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2);
+          xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2);
+          xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2);
+          xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2);
+
+          xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3);
+          xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3);
+          xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3);
+          xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3);
+
+          xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4);
+          xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4);
+          xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4);
+          xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+
+
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX);
+          IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX);
+          IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY);
+          IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY);
+          IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the  hvecAcc4 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY));
+          IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY);
+          IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output Scale Array (VQ), CNN convolution params structure  */
+/* InOuts      : Accumulator Tile, Output Tile                              */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t dilationX     = 1;
+  const uint8_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2 */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t numIter = kWidthU * numInCh;
+
+  int32_t dilatedKWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge;
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable to handle corner case when height is odd */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* Variable to handle corner case when width is odd */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize accumulators */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+        }
+        else  /* Accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL);
+          IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL);
+          IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL);
+          IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL);
+          IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL);
+        }
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+          for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End Input Channels */
+          /* Corner case handling as numIter is not a multiple of 4 */
+          if (k < numIter)
+          {
+            int32_t remInCh = numIter - k;
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End if( k < numIter)*/
+        } /* End Kernel Height * Width */
+
+        if (outputFlag)  /* Store to output Tile*/
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         remainingOutCh * numX * numY);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1);
+          xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1);
+          xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1);
+          xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1);
+
+          xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2);
+          xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2);
+          xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2);
+          xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2);
+
+          xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3);
+          xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3);
+          xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3);
+          xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3);
+
+          xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4);
+          xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4);
+          xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4);
+          xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+
+
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX);
+          IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX);
+          IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY);
+          IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY);
+          IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the  hvecAcc4 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY));
+          IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY);
+          IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t dilationX     = 1;
+  const uint8_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2 */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t numIter = kWidthU * numInCh;
+
+  int32_t dilatedKWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge;
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable to handle corner case when height is odd */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* Variable to handle corner case when width is odd */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize accumulators */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+        }
+        else  /* Accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL);
+          IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL);
+          IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL);
+          IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL);
+          IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL);
+        }
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+          for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, 4);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0);
+#else
+            xb_vec2Nx8 dvecData1;
+            xb_vec2Nx8 dvecData2;
+            xb_vec2Nx8 dvecData3;
+            xb_vec2Nx8 dvecData4;
+
+            dvecData1 = IVP_SUB2NX8U(dvecInp1, 128);
+            dvecData2 = IVP_SUB2NX8U(dvecInp2, 128);
+            dvecData3 = IVP_SUB2NX8U(dvecInp3, 128);
+            dvecData4 = IVP_SUB2NX8U(dvecInp4, 128);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+#endif
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#endif
+          }   /* End Input Channels */
+          /* Corner case handling as numIter is not a multiple of 4 */
+          if (k < numIter)
+          {
+            int32_t remInCh = numIter - k;
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, remInCh);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0);
+#else
+            xb_vec2Nx8 dvecData1 = 0;
+            xb_vec2Nx8 dvecData2 = 0;
+            xb_vec2Nx8 dvecData3 = 0;
+            xb_vec2Nx8 dvecData4 = 0;
+
+            IVP_SUB2NX8UT(dvecData1, dvecInp1, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+            IVP_SUB2NX8UT(dvecData2, dvecInp2, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+            IVP_SUB2NX8UT(dvecData3, dvecInp3, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+            IVP_SUB2NX8UT(dvecData4, dvecInp4, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+#endif
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#endif
+          }   /* End if( k < numIter)*/
+        } /* End Kernel Height * Width */
+
+        if (outputFlag)  /* Store to ouput Tile*/
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         remainingOutCh * numX * numY);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1);
+          xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1);
+          xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1);
+          xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1);
+
+          xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2);
+          xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2);
+          xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2);
+          xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2);
+
+          xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3);
+          xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3);
+          xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3);
+          xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3);
+
+          xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4);
+          xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4);
+          xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4);
+          xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+
+
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX);
+          IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX);
+          IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY);
+          IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY);
+          IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the  hvecAcc4 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY));
+          IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY);
+          IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output Scale Array (VQ variant only), CNN conv params      */
+/* InOuts      : Accumulator Tile, Output Tile                              */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Tile dimensions (output W/H, input and output channel counts) and dilation factors */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Kernel size (NDWH): width is dim3, height is dim4; dilated extent = dilation * (k - 1) + 1 */
+  const int32_t kWidthU   = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU  = XAI_TILE4D_GET_DIM4(coeffTile);
+  int32_t dilatedkWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1;
+
+  /* CNN convolution parameters; inputFlag / outputFlag choose bias-vs-accTile init and outTile-vs-accTile store */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data pointers of input, output, coefficient and bias data; accTile is only read when the input and output flags are not both set */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2; left at 0 when accTile is not used */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t leftEdge, topEdge; /* half of the dilated kernel extent; for an even extent the edge flag selects k/2 or k/2 - 1 */
+  if ((dilatedkWidthU % 2) != 0)
+  {
+    leftEdge = dilatedkWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1);
+  }
+
+  if ((dilatedkHeightU % 2) != 0)
+  {
+    topEdge = dilatedkHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1);
+  }
+
+
+  /* Move the input pointer back by leftEdge dim1 pitches and topEdge dim2 pitches (start of data including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 when outTile is S16, else 0 */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  /* Loops Start: each innermost step covers up to 2 * XCHAL_IVPN_SIMD_WIDTH output channels of a 2x2 block of output pixels */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH */
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /* Load output scale values */
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* numY is 1 when this step has a second output row, 0 on the last row of an odd height */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* numX is 1 when this step has a second output column, 0 on the last column of an odd width */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output and accumulator pointers for the first pixel of the current 2x2 block */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize the four accumulators from the bias array or from the accumulator tile */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+        }
+        else  /* Accumulator tile */
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL);
+          IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL);
+          IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL);
+          IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL);
+          IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL);
+        }
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff, coeffAddrOff;
+
+        for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */
+        {
+          /* Condition checks performed to get the Input and Coefficient        */
+          /* Pointer Offsets after combining the Kernel Width and Height Loops  */
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Pointers for Input Data Loads; the numX / numY factors drop the offsets of pixels 2-4 when the block is clipped */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + inAddrOff + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + inAddrOff + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Primes registers for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+          {
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+
+          /* Corner Case Handling if number of input channels not multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Coefficient Loads; the first two post-increments are gated by enable2 / enable3, and the 4th QMUL coefficient operand below is 0 */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Corner case handling */
+        }   /* End Kernel Height * Width */
+
+        if (outputFlag)  /* Store to output Tile */
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth (pointer offset and byte counts are scaled by numX) */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth (pointer offset and byte counts are scaled by numY) */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth (byte counts are scaled by numX * numY) */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile */
+        {
+          xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1);
+          xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1);
+          xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1);
+          xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1);
+
+          xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2);
+          xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2);
+          xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2);
+          xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2);
+
+          xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3);
+          xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3);
+          xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3);
+          xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3);
+
+          xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4);
+          xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4);
+          xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4);
+          xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+
+
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth (pointer offset and base byte count are scaled by numX) */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX);
+          IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX);
+          IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth (pointer offset and base byte count are scaled by numY) */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY);
+          IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY);
+          IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc4 along the accTile depth (base byte count is scaled by numX * numY) */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY));
+          IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY);
+          IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile4D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      const xai_pArray outputScaleArray,
+                                                                      xai_pTile3D accTile,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_conv_params *param
+                                                                      )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile4D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    xai_pTile3D accTile,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_conv_params *param
+                                                                    )
+#endif /* DILATED_VQ_CONV_PARTIAL */
+{ /* Each inner iteration produces a 2x2 patch of output pixels for up to 2 * XCHAL_IVPN_SIMD_WIDTH output channels */
+  /* Output width/height and channel counts from the tile dimensions; dilation from param */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Kernel Size (NDWH) and its dilated extent, dilation * (k - 1) + 1 */
+  const int32_t kWidthU   = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU  = XAI_TILE4D_GET_DIM4(coeffTile);
+  int32_t dilatedkWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1;
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL; /* accTile is only dereferenced when the input and output flags are not both set */
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif /* DILATED_VQ_CONV_PARTIAL */
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2; left at 0 when accTile is not read */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t leftEdge, topEdge; /* odd dilated size: k / 2; even dilated size: the edge flag selects k / 2 or k / 2 - 1 */
+  if ((dilatedkWidthU % 2) != 0)
+  {
+    leftEdge = dilatedkWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1);
+  }
+
+  if ((dilatedkHeightU % 2) != 0)
+  {
+    topEdge = dilatedkHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 when outTile is S16, else 0 */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH */
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /* Load output scale values */
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif /* DILATED_VQ_CONV_PARTIAL */
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* numY is 1 when output row y + 1 exists, 0 on the last row of an odd height */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* numX is 1 when output column x + 1 exists, 0 on the last column of an odd width */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize accumulators from the bias values or from the accumulator tile */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+        }
+        else  /* Accumulator tile */
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL);
+          IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL);
+          IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL);
+          IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL);
+          IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL);
+        }
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff, coeffAddrOff;
+
+        for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */
+        {
+          /* Condition checks performed to get the Input and Coefficient        */
+          /* Pointer Offsets after combining the Kernel Width and Height Loops  */
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + inAddrOff + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); /* offset is 0 unless numX and numY are both 1 */
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Primes registers for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+          {
+            xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, 4);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0);
+#else
+            xb_vec2Nx8 dvecData1;
+            xb_vec2Nx8 dvecData2;
+            xb_vec2Nx8 dvecData3;
+            xb_vec2Nx8 dvecData4;
+
+            dvecData1 = IVP_SUB2NX8U(dvecInp1, 128);
+            dvecData2 = IVP_SUB2NX8U(dvecInp2, 128);
+            dvecData3 = IVP_SUB2NX8U(dvecInp3, 128);
+            dvecData4 = IVP_SUB2NX8U(dvecInp4, 128);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+#endif
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#endif
+          } /* End Input Channels */
+
+          /* Corner Case Handling if number of input channels not multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, remInCh);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0);
+#else
+            xb_vec2Nx8 dvecData1 = 0;
+            xb_vec2Nx8 dvecData2 = 0;
+            xb_vec2Nx8 dvecData3 = 0;
+            xb_vec2Nx8 dvecData4 = 0;
+
+            IVP_SUB2NX8UT(dvecData1, dvecInp1, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+            IVP_SUB2NX8UT(dvecData2, dvecInp2, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+            IVP_SUB2NX8UT(dvecData3, dvecInp3, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+            IVP_SUB2NX8UT(dvecData4, dvecInp4, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+#endif
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Coefficient Loads; the pointer only advances past channels that exist (enable2 / enable3) */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#endif
+          } /* End Corner case handling */
+        }   /* End Kernel Height * Width */
+
+        if (outputFlag)  /* Store to output Tile */
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif /* DILATED_VQ_CONV_PARTIAL */
+          /* Store the output dvecOut1 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth (byte counts are 0 when numX is 0) */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth (byte counts are 0 when numY is 0) */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth (byte counts are 0 unless numX and numY are both 1) */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile */
+        {
+          xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1);
+          xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1);
+          xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1);
+          xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1);
+
+          xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2);
+          xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2);
+          xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2);
+          xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2);
+
+          xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3);
+          xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3);
+          xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3);
+          xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3);
+
+          xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4);
+          xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4);
+          xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4);
+          xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+
+
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth (byte counts are scaled by numX) */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX);
+          IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX);
+          IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth (byte counts are scaled by numY) */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY);
+          IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY);
+          IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc4 along the accTile depth (byte counts are scaled by numX * numY) */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY));
+          IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY);
+          IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t dilationX     = 1;
+  const uint8_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2 */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t dilatedKWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge;
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k, j;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  xb_vecN_2x32v hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH;
+  xb_vecN_2x32v hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH;
+  xb_vecN_2x32v hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH;
+  xb_vecN_2x32v hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+    xb_vecNx16U outScaleDataL, outScaleDataH;
+    /*Load output scale values*/
+    valign vaScale = IVP_LANX16U_PP(pOutScaleData);
+    IVP_LAVNX16_XP(outScaleDataL, vaScale, pOutScaleData, 2 * remainingOutCh);
+    IVP_LAVNX16_XP(outScaleDataH, vaScale, pOutScaleData, 2 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable to handle corner case when height is odd */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* Variable to handle corner case when width is odd */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize accumulators */
+        hvecSum1LL = hvecSum1LH = hvecSum1HL = hvecSum1HH = 0;
+        hvecSum2LL = hvecSum2LH = hvecSum2HL = hvecSum2HH = 0;
+        hvecSum3LL = hvecSum3LH = hvecSum3HL = hvecSum3HH = 0;
+        hvecSum4LL = hvecSum4LH = hvecSum4HL = hvecSum4HH = 0;
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+          /* (Input Channels * kWidth) loops combined */
+          for (j = 0; j < kWidthU * numInCh; j += 508) /* Emulation: To avoid 24 bit overflow 2^23-1 / 128 / 128 = 511.99 */
+          {
+            xb_vec2Nx24 daccSum1 = 0, daccSum2 = 0, daccSum3 = 0, daccSum4 = 0;
+            int32_t numIter      = XT_MIN(508, kWidthU * numInCh - j);
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+            for (k = 0; k < numIter; k += 4)
+            {
+              /* Aligning variable vector load of pixels */
+              xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+              xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+              xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+              xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+              /* Extracting first 4 bytes of vector into address register */
+              /* Scalar integers to be used for QMUL                      */
+              int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+              int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+              int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+              int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+              /* Aligned Vector Loads of coefficients */
+              xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+              xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+              xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+              xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+
+              IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+              IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+              IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+              IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+            }   /* End for (k = 0; k < row; k += 4) */
+
+            hvecSum1LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum1), hvecSum1LL);
+            hvecSum1LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), hvecSum1LH);
+            hvecSum1HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum1), hvecSum1HL);
+            hvecSum1HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), hvecSum1HH);
+
+            hvecSum2LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum2), hvecSum2LL);
+            hvecSum2LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum2), hvecSum2LH);
+            hvecSum2HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum2), hvecSum2HL);
+            hvecSum2HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum2), hvecSum2HH);
+
+            hvecSum3LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum3), hvecSum3LL);
+            hvecSum3LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum3), hvecSum3LH);
+            hvecSum3HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum3), hvecSum3HL);
+            hvecSum3HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum3), hvecSum3HH);
+
+            hvecSum4LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum4), hvecSum4LL);
+            hvecSum4LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum4), hvecSum4LH);
+            hvecSum4HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum4), hvecSum4HL);
+            hvecSum4HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum4), hvecSum4HH);
+          } /* End Kernel Height * Width */
+        }   /* End for (k = 0; k < row; k += 4)*/
+
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          xb_vecN_2x32v hvecBiasLL, hvecBiasLH, hvecBiasHL, hvecBiasHH;
+          valign vaBias = IVP_LAN_2X32_PP(phvecBias);
+          IVP_LAVN_2X32_XP(hvecBiasLL, vaBias, phvecBias, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecBiasLH, vaBias, phvecBias, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecBiasHL, vaBias, phvecBias, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecBiasHH, vaBias, phvecBias, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+
+          hvecSum1LL = IVP_ADDN_2X32(hvecSum1LL, hvecBiasLL);
+          hvecSum1LH = IVP_ADDN_2X32(hvecSum1LH, hvecBiasLH);
+          hvecSum1HL = IVP_ADDN_2X32(hvecSum1HL, hvecBiasHL);
+          hvecSum1HH = IVP_ADDN_2X32(hvecSum1HH, hvecBiasHH);
+
+          hvecSum2LL = IVP_ADDN_2X32(hvecSum2LL, hvecBiasLL);
+          hvecSum2LH = IVP_ADDN_2X32(hvecSum2LH, hvecBiasLH);
+          hvecSum2HL = IVP_ADDN_2X32(hvecSum2HL, hvecBiasHL);
+          hvecSum2HH = IVP_ADDN_2X32(hvecSum2HH, hvecBiasHH);
+
+          hvecSum3LL = IVP_ADDN_2X32(hvecSum3LL, hvecBiasLL);
+          hvecSum3LH = IVP_ADDN_2X32(hvecSum3LH, hvecBiasLH);
+          hvecSum3HL = IVP_ADDN_2X32(hvecSum3HL, hvecBiasHL);
+          hvecSum3HH = IVP_ADDN_2X32(hvecSum3HH, hvecBiasHH);
+
+          hvecSum4LL = IVP_ADDN_2X32(hvecSum4LL, hvecBiasLL);
+          hvecSum4LH = IVP_ADDN_2X32(hvecSum4LH, hvecBiasLH);
+          hvecSum4HL = IVP_ADDN_2X32(hvecSum4HL, hvecBiasHL);
+          hvecSum4HH = IVP_ADDN_2X32(hvecSum4HH, hvecBiasHH);
+        }
+        else  /* Accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+
+          hvecSum1LL = IVP_ADDN_2X32(hvecSum1LL, hvecAcc1LL);
+          hvecSum1LH = IVP_ADDN_2X32(hvecSum1LH, hvecAcc1LH);
+          hvecSum1HL = IVP_ADDN_2X32(hvecSum1HL, hvecAcc1HL);
+          hvecSum1HH = IVP_ADDN_2X32(hvecSum1HH, hvecAcc1HH);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+
+          hvecSum2LL = IVP_ADDN_2X32(hvecSum2LL, hvecAcc2LL);
+          hvecSum2LH = IVP_ADDN_2X32(hvecSum2LH, hvecAcc2LH);
+          hvecSum2HL = IVP_ADDN_2X32(hvecSum2HL, hvecAcc2HL);
+          hvecSum2HH = IVP_ADDN_2X32(hvecSum2HH, hvecAcc2HH);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          hvecSum3LL = IVP_ADDN_2X32(hvecSum3LL, hvecAcc3LL);
+          hvecSum3LH = IVP_ADDN_2X32(hvecSum3LH, hvecAcc3LH);
+          hvecSum3HL = IVP_ADDN_2X32(hvecSum3HL, hvecAcc3HL);
+          hvecSum3HH = IVP_ADDN_2X32(hvecSum3HH, hvecAcc3HH);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          hvecSum4LL = IVP_ADDN_2X32(hvecSum4LL, hvecAcc4LL);
+          hvecSum4LH = IVP_ADDN_2X32(hvecSum4LH, hvecAcc4LH);
+          hvecSum4HL = IVP_ADDN_2X32(hvecSum4HL, hvecAcc4HL);
+          hvecSum4HH = IVP_ADDN_2X32(hvecSum4HH, hvecAcc4HH);
+        }
+
+
+        if (outputFlag)  /* Store to ouput Tile*/
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut1L, dvecOut1H, hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH, \
+                                                packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut2L, dvecOut2H, hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH, \
+                                                packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut3L, dvecOut3H, hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH, \
+                                                packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut4L, dvecOut4H, hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH, \
+                                                packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut1L, dvecOut1H, hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH, \
+                                             packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut2L, dvecOut2H, hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH, \
+                                             packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut3L, dvecOut3H, hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH, \
+                                             packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut4L, dvecOut4H, hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH, \
+                                             packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         remainingOutCh * numX * numY);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile*/
+        {
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecSum1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecSum1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX);
+          IVP_SAVN_2X32_XP(hvecSum2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX);
+          IVP_SAVN_2X32_XP(hvecSum2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY);
+          IVP_SAVN_2X32_XP(hvecSum3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY);
+          IVP_SAVN_2X32_XP(hvecSum3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the  hvecAcc4 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY));
+          IVP_SAVN_2X32_XP(hvecSum4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY);
+          IVP_SAVN_2X32_XP(hvecSum4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* InOuts      : Accumulator Tile, Output Tile                              */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t dilationX     = 1;
+  const uint8_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2 */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t dilatedKWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge;
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k, j;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  xb_vecN_2x32v hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH;
+  xb_vecN_2x32v hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH;
+  xb_vecN_2x32v hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH;
+  xb_vecN_2x32v hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+    xb_vecNx16U outScaleDataL, outScaleDataH;
+    /*Load output scale values*/
+    valign vaScale = IVP_LANX16U_PP(pOutScaleData);
+    IVP_LAVNX16_XP(outScaleDataL, vaScale, pOutScaleData, 2 * remainingOutCh);
+    IVP_LAVNX16_XP(outScaleDataH, vaScale, pOutScaleData, 2 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable to handle corner case when height is odd */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* Variable to handle corner case when width is odd */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize accumulators */
+        hvecSum1LL = hvecSum1LH = hvecSum1HL = hvecSum1HH = 0;
+        hvecSum2LL = hvecSum2LH = hvecSum2HL = hvecSum2HH = 0;
+        hvecSum3LL = hvecSum3LH = hvecSum3HL = hvecSum3HH = 0;
+        hvecSum4LL = hvecSum4LH = hvecSum4HL = hvecSum4HH = 0;
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+          for (j = 0; j < kWidthU * numInCh; j += 508)
+          {
+            xb_vec2Nx24 daccSum1 = 0, daccSum2 = 0, daccSum3 = 0, daccSum4 = 0;
+            int32_t numIter      = XT_MIN(508, kWidthU * numInCh - j);
+            for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */
+            {
+              /* Aligning variable vector load of pixels */
+              xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+              xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+              xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+              xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+              /* Extracting first 4 bytes of vector into address register */
+              /* Scalar integers to be used for QMUL                      */
+              int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+              int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+              int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+              int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+              /* Aligned Vector Loads of coefficients */
+              xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+              xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+              xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+              xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+
+              IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+              IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+              IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+              IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+            }   /* End Input Channels */
+            /* Corner case handling as numIter is not a multiple of 4 */
+            if (k < numIter)
+            {
+              int32_t remInCh = numIter - k;
+
+              /* Aligning variable vector load of pixels */
+              xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+              xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+              xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+              xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+              /* Extracting first 4 bytes of vector into address register */
+              /* Scalar integers to be used for QMUL                      */
+              int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+              int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+              int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+              int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+              /* For conditional coefficient loads */
+              int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+              int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+              /* Aligned Vector Loads of coefficients */
+              xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+              xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+              xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+
+              IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+              IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+              IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+              IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+            }   /* End if( k < numIter)*/
+
+            hvecSum1LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum1), hvecSum1LL);
+            hvecSum1LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), hvecSum1LH);
+            hvecSum1HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum1), hvecSum1HL);
+            hvecSum1HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), hvecSum1HH);
+
+            hvecSum2LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum2), hvecSum2LL);
+            hvecSum2LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum2), hvecSum2LH);
+            hvecSum2HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum2), hvecSum2HL);
+            hvecSum2HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum2), hvecSum2HH);
+
+            hvecSum3LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum3), hvecSum3LL);
+            hvecSum3LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum3), hvecSum3LH);
+            hvecSum3HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum3), hvecSum3HL);
+            hvecSum3HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum3), hvecSum3HH);
+
+            hvecSum4LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum4), hvecSum4LL);
+            hvecSum4LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum4), hvecSum4LH);
+            hvecSum4HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum4), hvecSum4HL);
+            hvecSum4HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum4), hvecSum4HH);
+          }
+        } /* End Kernel Height * Width */
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          xb_vecN_2x32v hvecBiasLL, hvecBiasLH, hvecBiasHL, hvecBiasHH;
+          valign vaBias = IVP_LAN_2X32_PP(phvecBias);
+          IVP_LAVN_2X32_XP(hvecBiasLL, vaBias, phvecBias, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecBiasLH, vaBias, phvecBias, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecBiasHL, vaBias, phvecBias, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecBiasHH, vaBias, phvecBias, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+
+          hvecSum1LL = IVP_ADDN_2X32(hvecSum1LL, hvecBiasLL);
+          hvecSum1LH = IVP_ADDN_2X32(hvecSum1LH, hvecBiasLH);
+          hvecSum1HL = IVP_ADDN_2X32(hvecSum1HL, hvecBiasHL);
+          hvecSum1HH = IVP_ADDN_2X32(hvecSum1HH, hvecBiasHH);
+
+          hvecSum2LL = IVP_ADDN_2X32(hvecSum2LL, hvecBiasLL);
+          hvecSum2LH = IVP_ADDN_2X32(hvecSum2LH, hvecBiasLH);
+          hvecSum2HL = IVP_ADDN_2X32(hvecSum2HL, hvecBiasHL);
+          hvecSum2HH = IVP_ADDN_2X32(hvecSum2HH, hvecBiasHH);
+
+          hvecSum3LL = IVP_ADDN_2X32(hvecSum3LL, hvecBiasLL);
+          hvecSum3LH = IVP_ADDN_2X32(hvecSum3LH, hvecBiasLH);
+          hvecSum3HL = IVP_ADDN_2X32(hvecSum3HL, hvecBiasHL);
+          hvecSum3HH = IVP_ADDN_2X32(hvecSum3HH, hvecBiasHH);
+
+          hvecSum4LL = IVP_ADDN_2X32(hvecSum4LL, hvecBiasLL);
+          hvecSum4LH = IVP_ADDN_2X32(hvecSum4LH, hvecBiasLH);
+          hvecSum4HL = IVP_ADDN_2X32(hvecSum4HL, hvecBiasHL);
+          hvecSum4HH = IVP_ADDN_2X32(hvecSum4HH, hvecBiasHH);
+        }
+        else  /* Accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+
+          hvecSum1LL = IVP_ADDN_2X32(hvecSum1LL, hvecAcc1LL);
+          hvecSum1LH = IVP_ADDN_2X32(hvecSum1LH, hvecAcc1LH);
+          hvecSum1HL = IVP_ADDN_2X32(hvecSum1HL, hvecAcc1HL);
+          hvecSum1HH = IVP_ADDN_2X32(hvecSum1HH, hvecAcc1HH);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+
+          hvecSum2LL = IVP_ADDN_2X32(hvecSum2LL, hvecAcc2LL);
+          hvecSum2LH = IVP_ADDN_2X32(hvecSum2LH, hvecAcc2LH);
+          hvecSum2HL = IVP_ADDN_2X32(hvecSum2HL, hvecAcc2HL);
+          hvecSum2HH = IVP_ADDN_2X32(hvecSum2HH, hvecAcc2HH);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          hvecSum3LL = IVP_ADDN_2X32(hvecSum3LL, hvecAcc3LL);
+          hvecSum3LH = IVP_ADDN_2X32(hvecSum3LH, hvecAcc3LH);
+          hvecSum3HL = IVP_ADDN_2X32(hvecSum3HL, hvecAcc3HL);
+          hvecSum3HH = IVP_ADDN_2X32(hvecSum3HH, hvecAcc3HH);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          hvecSum4LL = IVP_ADDN_2X32(hvecSum4LL, hvecAcc4LL);
+          hvecSum4LH = IVP_ADDN_2X32(hvecSum4LH, hvecAcc4LH);
+          hvecSum4HL = IVP_ADDN_2X32(hvecSum4HL, hvecAcc4HL);
+          hvecSum4HH = IVP_ADDN_2X32(hvecSum4HH, hvecAcc4HH);
+        }
+
+        if (outputFlag)  /* Store to output Tile*/
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut1L, dvecOut1H, hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH, \
+                                                packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut2L, dvecOut2H, hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH, \
+                                                packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut3L, dvecOut3H, hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH, \
+                                                packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut4L, dvecOut4H, hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH, \
+                                                packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut1L, dvecOut1H, hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH, \
+                                             packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut2L, dvecOut2H, hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH, \
+                                             packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut3L, dvecOut3H, hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH, \
+                                             packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut4L, dvecOut4H, hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH, \
+                                             packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         remainingOutCh * numX * numY);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile*/
+        {
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecSum1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecSum1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX);
+          IVP_SAVN_2X32_XP(hvecSum2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX);
+          IVP_SAVN_2X32_XP(hvecSum2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY);
+          IVP_SAVN_2X32_XP(hvecSum3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY);
+          IVP_SAVN_2X32_XP(hvecSum3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the  hvecAcc4 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY));
+          IVP_SAVN_2X32_XP(hvecSum4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY);
+          IVP_SAVN_2X32_XP(hvecSum4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU   = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU  = XAI_TILE4D_GET_DIM4(coeffTile);
+  int32_t dilatedkWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1;
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2 */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t leftEdge, topEdge;
+  if ((dilatedkWidthU % 2) != 0)
+  {
+    leftEdge = dilatedkWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1);
+  }
+
+  if ((dilatedkHeightU % 2) != 0)
+  {
+    topEdge = dilatedkHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, k, j;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  xb_vecN_2x32v hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH;
+  xb_vecN_2x32v hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH;
+  xb_vecN_2x32v hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH;
+  xb_vecN_2x32v hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH;
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+    xb_vecNx16U outScaleDataL, outScaleDataH;
+    /*Load output scale values*/
+    valign vaScale = IVP_LANX16U_PP(pOutScaleData);
+    IVP_LAVNX16_XP(outScaleDataL, vaScale, pOutScaleData, 2 * remainingOutCh);
+    IVP_LAVNX16_XP(outScaleDataH, vaScale, pOutScaleData, 2 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    {                             /* walk down the rows */
+      /* Variable to handle corner case when height is odd */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      {                             /* walk across the columns */
+        /* Variable to handle corner case when width is odd */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize accumulators */
+        hvecSum1LL = hvecSum1LH = hvecSum1HL = hvecSum1HH = 0;
+        hvecSum2LL = hvecSum2LH = hvecSum2HL = hvecSum2HH = 0;
+        hvecSum3LL = hvecSum3LH = hvecSum3HL = hvecSum3HH = 0;
+        hvecSum4LL = hvecSum4LH = hvecSum4HL = hvecSum4HH = 0;
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff, coeffAddrOff;
+
+        for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */
+        {
+          /* Condition checks performed to get the Input and Coefficient        */
+          /* Pointer Offsets after combining the Kernel Width and Height Loops  */
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + inAddrOff + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + inAddrOff + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Primes registers for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+          for (j = 0; j < numInCh; j += 508) /* Emulation: To avoid 24 bit overflow 2^23-1 / 128 / 128 = 511.99 */
+          {
+            xb_vec2Nx24 daccSum1 = 0, daccSum2 = 0, daccSum3 = 0, daccSum4 = 0;
+            int32_t numIter      = XT_MIN(508, numInCh - j);
+            for (inCh = 0; inCh < numIter - 3; inCh += 4)
+            {
+              xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+              xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+              xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+              xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+              /* Extracting first 4 bytes of vector into address register */
+              /* Scalar integers to be used for QMUL                      */
+              int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+              int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+              int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+              int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+              /* Aligned Vector Loads of coefficients */
+              xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+              xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+              xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+              xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+              IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+              IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+              IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+              IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+            } /* End for (inCh = 0; inCh < numIter - 3; inCh += 4) */
+
+            /* Corner Case Handling if number of input channels not multiple of 4 */
+            if (inCh < numIter)
+            {
+              int32_t remInCh = numIter - inCh;
+
+              /* Aligning variable vector load of pixels */
+              xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+              xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+              xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+              xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+              /* Extracting first 4 bytes of vector into address register */
+              /* Scalar integers to be used for QMUL                      */
+              int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+              int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+              int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+              int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                     (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+              /* For conditional coefficient loads */
+              int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+              int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+              /* Coefficient Loads */
+              xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+              xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+              xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+              IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+              IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+              IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+              IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+            } /* End Corner case handling */
+
+            hvecSum1LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum1), hvecSum1LL);
+            hvecSum1LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), hvecSum1LH);
+            hvecSum1HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum1), hvecSum1HL);
+            hvecSum1HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), hvecSum1HH);
+
+            hvecSum2LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum2), hvecSum2LL);
+            hvecSum2LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum2), hvecSum2LH);
+            hvecSum2HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum2), hvecSum2HL);
+            hvecSum2HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum2), hvecSum2HH);
+
+            hvecSum3LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum3), hvecSum3LL);
+            hvecSum3LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum3), hvecSum3LH);
+            hvecSum3HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum3), hvecSum3HL);
+            hvecSum3HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum3), hvecSum3HH);
+
+            hvecSum4LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum4), hvecSum4LL);
+            hvecSum4LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum4), hvecSum4LH);
+            hvecSum4HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum4), hvecSum4HL);
+            hvecSum4HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum4), hvecSum4HH);
+          }  /* End for(j = 0; j < numInCh; j += 508)*/
+        }   /* End Kernel Height * Width */
+
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          xb_vecN_2x32v hvecBiasLL, hvecBiasLH, hvecBiasHL, hvecBiasHH;
+          valign vaBias = IVP_LAN_2X32_PP(phvecBias);
+          IVP_LAVN_2X32_XP(hvecBiasLL, vaBias, phvecBias, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecBiasLH, vaBias, phvecBias, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecBiasHL, vaBias, phvecBias, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecBiasHH, vaBias, phvecBias, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+
+          hvecSum1LL = IVP_ADDN_2X32(hvecSum1LL, hvecBiasLL);
+          hvecSum1LH = IVP_ADDN_2X32(hvecSum1LH, hvecBiasLH);
+          hvecSum1HL = IVP_ADDN_2X32(hvecSum1HL, hvecBiasHL);
+          hvecSum1HH = IVP_ADDN_2X32(hvecSum1HH, hvecBiasHH);
+
+          hvecSum2LL = IVP_ADDN_2X32(hvecSum2LL, hvecBiasLL);
+          hvecSum2LH = IVP_ADDN_2X32(hvecSum2LH, hvecBiasLH);
+          hvecSum2HL = IVP_ADDN_2X32(hvecSum2HL, hvecBiasHL);
+          hvecSum2HH = IVP_ADDN_2X32(hvecSum2HH, hvecBiasHH);
+
+          hvecSum3LL = IVP_ADDN_2X32(hvecSum3LL, hvecBiasLL);
+          hvecSum3LH = IVP_ADDN_2X32(hvecSum3LH, hvecBiasLH);
+          hvecSum3HL = IVP_ADDN_2X32(hvecSum3HL, hvecBiasHL);
+          hvecSum3HH = IVP_ADDN_2X32(hvecSum3HH, hvecBiasHH);
+
+          hvecSum4LL = IVP_ADDN_2X32(hvecSum4LL, hvecBiasLL);
+          hvecSum4LH = IVP_ADDN_2X32(hvecSum4LH, hvecBiasLH);
+          hvecSum4HL = IVP_ADDN_2X32(hvecSum4HL, hvecBiasHL);
+          hvecSum4HH = IVP_ADDN_2X32(hvecSum4HH, hvecBiasHH);
+        }
+        else  /* Accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+
+          hvecSum1LL = IVP_ADDN_2X32(hvecSum1LL, hvecAcc1LL);
+          hvecSum1LH = IVP_ADDN_2X32(hvecSum1LH, hvecAcc1LH);
+          hvecSum1HL = IVP_ADDN_2X32(hvecSum1HL, hvecAcc1HL);
+          hvecSum1HH = IVP_ADDN_2X32(hvecSum1HH, hvecAcc1HH);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+
+          hvecSum2LL = IVP_ADDN_2X32(hvecSum2LL, hvecAcc2LL);
+          hvecSum2LH = IVP_ADDN_2X32(hvecSum2LH, hvecAcc2LH);
+          hvecSum2HL = IVP_ADDN_2X32(hvecSum2HL, hvecAcc2HL);
+          hvecSum2HH = IVP_ADDN_2X32(hvecSum2HH, hvecAcc2HH);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          hvecSum3LL = IVP_ADDN_2X32(hvecSum3LL, hvecAcc3LL);
+          hvecSum3LH = IVP_ADDN_2X32(hvecSum3LH, hvecAcc3LH);
+          hvecSum3HL = IVP_ADDN_2X32(hvecSum3HL, hvecAcc3HL);
+          hvecSum3HH = IVP_ADDN_2X32(hvecSum3HH, hvecAcc3HH);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          hvecSum4LL = IVP_ADDN_2X32(hvecSum4LL, hvecAcc4LL);
+          hvecSum4LH = IVP_ADDN_2X32(hvecSum4LH, hvecAcc4LH);
+          hvecSum4HL = IVP_ADDN_2X32(hvecSum4HL, hvecAcc4HL);
+          hvecSum4HH = IVP_ADDN_2X32(hvecSum4HH, hvecAcc4HH);
+        }
+
+        if (outputFlag)  /* Store to ouput Tile*/
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut1L, dvecOut1H, hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH, \
+                                                packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut2L, dvecOut2H, hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH, \
+                                                packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut3L, dvecOut3H, hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH, \
+                                                packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut4L, dvecOut4H, hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH, \
+                                                packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut1L, dvecOut1H, hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH, \
+                                             packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut2L, dvecOut2H, hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH, \
+                                             packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut3L, dvecOut3H, hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH, \
+                                             packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut4L, dvecOut4H, hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH, \
+                                             packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile*/
+        {
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecSum1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecSum1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX);
+          IVP_SAVN_2X32_XP(hvecSum2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX);
+          IVP_SAVN_2X32_XP(hvecSum2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY);
+          IVP_SAVN_2X32_XP(hvecSum3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY);
+          IVP_SAVN_2X32_XP(hvecSum3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the  hvecAcc4 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY));
+          IVP_SAVN_2X32_XP(hvecSum4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY);
+          IVP_SAVN_2X32_XP(hvecSum4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecSum4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/*****************************************************************************
+*  xaiPartialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH   \
+*  xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for MxN MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN MOD_DWH 3D partial */
+/*               dilated convolution function and MxN MOD_DWH 3D VQ partial */
+/*               dilated convolution function                               */
+/*               Stride values from 1 to 64 pass the argument checks        */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and only dilation = 1 for stride > 1                       */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Accumulator Tile, Output Tile                              */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 64.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Accumulated value will be within 24bit range               */
+/****************************************************************************/
+#ifdef DILATED_VQ_CONV_PARTIAL /* VQ variant: takes an extra outputScaleArray argument */
+XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,            /* input tile; checked below as S8 with XAI_DWH order */
+  const xai_pTile4D coeffTile,         /* coefficient tile; checked below as S8 with XAI_NDWH order */
+  const xai_pArray biasArray,          /* bias array; checked as S32 when the input flag is set */
+  const xai_pArray outputScaleArray,   /* checked below as U16 with width >= coeffTile DIM1 */
+  xai_pTile3D accTile,                 /* checked as S32 unless the input and output flags are both set */
+  xai_pTile3D outTile,                 /* output tile; checked below for XAI_DWH order */
+  const xai_cnn_conv_params *param     /* source of stride, dilation, shifts, flags and relu limits */
+  )
+#else /* non-VQ variant: same parameters minus outputScaleArray; output scale is read from param */
+XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif /* DILATED_VQ_CONV_PARTIAL */
+{
+  /* Error checks: argument validation grouped under XAI_ERROR_CHECKS() */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_POINTER(biasArray);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE,   \
+                    "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 64", \
+                    XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); /* DIM4 is reported as kernel height, DIM3 as kernel width */
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) &&                                      \
+                    ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG,                    \
+                    "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); /* both strides must lie in [1, 64] */
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) ||                                                            \
+                    ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) &&                                                           \
+                     (XAI_CNN_CONV_GET_STRIDEX(param) == 1)), XAI_ERR_BADARG,                                              \
+                    "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when strideX is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONX(param)); /* dilationX > 1 is accepted only together with strideX == 1 */
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) ||                                                            \
+                    ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) &&                                                           \
+                     (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG,                                              \
+                    "\nDilationY = %hhu\nDilationY should be 1. It can be greater than 1 only when strideY is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONY(param)); /* dilationY > 1 is accepted only together with strideY == 1 */
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV_PARTIAL /* VQ only: validate the output scale array */
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); /* coeffTile DIM1 is reported as the number of kernels */
+#endif /* DILATED_VQ_CONV_PARTIAL */
+
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+
+    if (XAI_CNN_CONV_GET_FLAG_INPUT(param)) /* bias element type is checked only when the input flag is set */
+    {
+      XAI_CHECK_ARRAY_S32(biasArray);
+    }
+    if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) /* accTile is validated unless both flags are set */
+    {
+      XAI_CHECK_TILE3D_S32(accTile);
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(accTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, accTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, accTile);
+      XAI_CHECK_TILE3D_DATA_ORDER(accTile, XAI_DWH);
+      XAI_CHECK_TILE3D_SIZE_EQ(accTile, outTile);
+    }
+    if (XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) /* these outTile checks apply only when the output flag is set */
+    {
+      XAI_CHECK_TILE3D(outTile);
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+      if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param))) /* accTile/outTile overlap is checked only when the input flag is clear */
+      {
+        XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(accTile, outTile);
+      }
+    }
+  }
+#ifndef DILATED_VQ_CONV_PARTIAL /* non-VQ only: shortcut when the output scale is zero */
+  if ((XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) && \
+      XAI_CNN_CONV_GET_FLAG_OUTPUT(param))
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; /* with relu: CLAMP of 0 against the relu min/max; otherwise 0 */
+    return(xaiFillTile3D(outTile, fillValue, 0)); /* fill outTile and return that call's status; no convolution kernel is run */
+  }
+#endif /* !DILATED_VQ_CONV_PARTIAL */
+
+  /* Use the contiguous-depth kernels when dilationX == 1 and either input DIM1 equals its DIM1 pitch or coeff DIM3 (kernel width) is 1; otherwise use the generic MxN kernel. NOTE(review): only dilationX is tested here, not dilationY - confirm the d1 kernels handle dilationY > 1. */
+  if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1) &&                            \
+      ((XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile)) || \
+       (XAI_TILE4D_GET_DIM3(coeffTile) == 1)))
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) /* input DIM1 * coeff DIM3 is a multiple of 4: take the _x4 variant */
+    {
+#ifdef DILATED_VQ_CONV_PARTIAL
+      partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, \
+                                                                         coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else /* !DILATED_VQ_CONV_PARTIAL */
+      partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, \
+                                                                       coeffTile, biasArray, accTile, outTile, param);
+#endif /* DILATED_VQ_CONV_PARTIAL */
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV_PARTIAL
+      partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                                      coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else /* !DILATED_VQ_CONV_PARTIAL */
+      partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                                    coeffTile, biasArray, accTile, outTile, param);
+#endif /* DILATED_VQ_CONV_PARTIAL */
+    }
+  }
+  else
+  {
+#ifdef DILATED_VQ_CONV_PARTIAL
+    partialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH(inTile, \
+                                                 coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else /* !DILATED_VQ_CONV_PARTIAL */
+    partialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH(inTile, \
+                                               coeffTile, biasArray, accTile, outTile, param);
+#endif /* DILATED_VQ_CONV_PARTIAL */
+  }
+  return(XAI_ERROR_STATUS()); /* kernel calls above are used as plain statements; presumably this is the status recorded by the checks - confirm */
+}
+
+/*****************************************************************************
+*  xaiPartialConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH   \
+*  xaiPartialConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for MxN MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN MOD_DWH 3D partial */
+/*               dilated convolution function and MxN MOD_DWH 3D VQ partial */
+/*               dilated convolution function                               */
+/*               Stride values from 1 to 64 pass the argument checks        */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and only dilation = 1 for stride > 1                       */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Accumulator Tile, Output Tile                              */
+/* Assumptions : InData are U8, CoeffData are S8                            */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 64.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Accumulated value will be within 24bit range               */
+/****************************************************************************/
+#ifdef DILATED_VQ_CONV_PARTIAL /* VQ variant: takes an extra outputScaleArray argument */
+XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,            /* input tile; checked below as U8 with XAI_DWH order */
+  const xai_pTile4D coeffTile,         /* coefficient tile; checked below as S8 with XAI_NDWH order */
+  const xai_pArray biasArray,          /* bias array; checked as S32 when the input flag is set */
+  const xai_pArray outputScaleArray,   /* checked below as U16 with width >= coeffTile DIM1 */
+  xai_pTile3D accTile,                 /* checked as S32 unless the input and output flags are both set */
+  xai_pTile3D outTile,                 /* output tile; checked below for XAI_DWH order */
+  const xai_cnn_conv_params *param     /* source of stride, dilation, shifts, flags and relu limits */
+  )
+#else /* non-VQ variant: same parameters minus outputScaleArray; output scale is read from param */
+XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif /* DILATED_VQ_CONV_PARTIAL */
+{
+  /* Error checks: argument validation grouped under XAI_ERROR_CHECKS() */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_POINTER(biasArray);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE,   \
+                    "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 64", \
+                    XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); /* DIM4 is reported as kernel height, DIM3 as kernel width */
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) &&                                      \
+                    ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG,                    \
+                    "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); /* both strides must lie in [1, 64] */
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) ||                                                            \
+                    ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) &&                                                           \
+                     (XAI_CNN_CONV_GET_STRIDEX(param) == 1)), XAI_ERR_BADARG,                                              \
+                    "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when strideX is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONX(param)); /* dilationX > 1 is accepted only together with strideX == 1 */
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) ||                                                            \
+                    ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) &&                                                           \
+                     (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG,                                              \
+                    "\nDilationY = %hhu\nDilationY should be 1. It can be greater than 1 only when strideY is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONY(param)); /* dilationY > 1 is accepted only together with strideY == 1 */
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV_PARTIAL /* VQ only: validate the output scale array */
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); /* coeffTile DIM1 is reported as the number of kernels */
+#endif /* DILATED_VQ_CONV_PARTIAL */
+
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+
+    if (XAI_CNN_CONV_GET_FLAG_INPUT(param)) /* bias element type is checked only when the input flag is set */
+    {
+      XAI_CHECK_ARRAY_S32(biasArray);
+    }
+    if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) /* accTile is validated unless both flags are set */
+    {
+      XAI_CHECK_TILE3D_S32(accTile);
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(accTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, accTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, accTile);
+      XAI_CHECK_TILE3D_DATA_ORDER(accTile, XAI_DWH);
+      XAI_CHECK_TILE3D_SIZE_EQ(accTile, outTile);
+    }
+    if (XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) /* these outTile checks apply only when the output flag is set */
+    {
+      XAI_CHECK_TILE3D(outTile);
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+      if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param))) /* accTile/outTile overlap is checked only when the input flag is clear */
+      {
+        XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(accTile, outTile);
+      }
+    }
+  }
+#ifndef DILATED_VQ_CONV_PARTIAL /* non-VQ only: shortcut when the output scale is zero */
+  if ((XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) && \
+      XAI_CNN_CONV_GET_FLAG_OUTPUT(param))
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; /* with relu: CLAMP of 0 against the relu min/max; otherwise 0 */
+    return(xaiFillTile3D(outTile, fillValue, 0)); /* fill outTile and return that call's status; no convolution kernel is run */
+  }
+#endif /* !DILATED_VQ_CONV_PARTIAL */
+
+  /* Use the contiguous-depth kernels when dilationX == 1 and either input DIM1 equals its DIM1 pitch or coeff DIM3 (kernel width) is 1; otherwise use the generic MxN kernel. NOTE(review): only dilationX is tested here, not dilationY - confirm the d1 kernels handle dilationY > 1. */
+  if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1) &&                            \
+      ((XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile)) || \
+       (XAI_TILE4D_GET_DIM3(coeffTile) == 1)))
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) /* input DIM1 * coeff DIM3 is a multiple of 4: take the _x4 variant */
+    {
+#ifdef DILATED_VQ_CONV_PARTIAL
+      partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, \
+                                                                         coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else /* !DILATED_VQ_CONV_PARTIAL */
+      partialConvolved3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, \
+                                                                       coeffTile, biasArray, accTile, outTile, param);
+#endif /* DILATED_VQ_CONV_PARTIAL */
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV_PARTIAL
+      partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                                      coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else /* !DILATED_VQ_CONV_PARTIAL */
+      partialConvolved3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \
+                                                                    coeffTile, biasArray, accTile, outTile, param);
+#endif /* DILATED_VQ_CONV_PARTIAL */
+    }
+  }
+  else
+  {
+#ifdef DILATED_VQ_CONV_PARTIAL
+    partialConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH(inTile, \
+                                                 coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else /* !DILATED_VQ_CONV_PARTIAL */
+    partialConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH(inTile, \
+                                               coeffTile, biasArray, accTile, outTile, param);
+#endif /* DILATED_VQ_CONV_PARTIAL */
+  }
+  return(XAI_ERROR_STATUS()); /* kernel calls above are used as plain statements; presumably this is the status recorded by the checks - confirm */
+}
+
+/**********partialConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH************/
+/**********partialConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH  ************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output Scale Array (VQ build), CNN conv params structure   */
+/* InOuts      : Accumulator Tile, Output Tile                              */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL /* variant taking outputScaleArray, which the body indexes by output channel */
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile,
+                                                                                const xai_pTile4D coeffTile,
+                                                                                const xai_pArray biasArray,
+                                                                                const xai_pArray outputScaleArray,
+                                                                                xai_pTile3D accTile,
+                                                                                xai_pTile3D outTile,
+                                                                                const xai_cnn_conv_params *param
+                                                                                )
+#else /* variant using the single output scale read from param */
+static _XAI_INLINE_ void partialConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile,
+                                                                              const xai_pTile4D coeffTile,
+                                                                              const xai_pArray biasArray,
+                                                                              xai_pTile3D accTile,
+                                                                              xai_pTile3D outTile,
+                                                                              const xai_cnn_conv_params *param
+                                                                              )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU   = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU  = XAI_TILE4D_GET_DIM4(coeffTile);
+  int32_t dilatedkWidthU  = dilationX * (kWidthU - 1) + 1; /* kernel width once the dilation gaps are counted */
+  int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1; /* kernel height once the dilation gaps are counted */
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param); /* nonzero: accumulators start from the bias instead of accTile */
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); /* nonzero: results go to outTile instead of back to accTile */
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL; /* stays NULL when both INPUT and OUTPUT flags are set: accTile is then neither read nor written */
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2; both stay 0 when accTile is not used */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t leftEdge, topEdge; /* input columns / rows that precede the data pointer and are covered by the dilated kernel */
+  if ((dilatedkWidthU % 2) != 0)
+  {
+    leftEdge = dilatedkWidthU / 2;
+  }
+  else /* even dilated width: the LEFTEDGE flag selects width/2 versus width/2 - 1 */
+  {
+    leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1);
+  }
+
+  if ((dilatedkHeightU % 2) != 0)
+  {
+    topEdge = dilatedkHeightU / 2;
+  }
+  else /* even dilated height: the TOPEDGE flag selects height/2 versus height/2 - 1 */
+  {
+    topEdge = topEdgeFlag ? (dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else /* no ReLU: clamp to the full range of the output type (S16, else S8, else 0..UCHAR_MAX) */
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 for S16 output, 0 otherwise */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, k;
+  valign vaOutData = IVP_ZALIGN(); /* shared by the outTile stores and the accTile stores below */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  /* Loops Start */
+  for (y = 0; y < outH; y++) /* Image Height */
+  {
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /*Load output scale values*/
+      xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+      for (x = 0; x < outW; x += 4) /* Image Width */
+      {                             /* walk across the columns */
+        int32_t enable2ndWidth = XT_SALT(1, outW - x); /* 1 if outW - x > 1 (a 2nd column exists in this group of 4), else 0 */
+        int32_t enable3rdWidth = XT_SALT(2, outW - x); /* 1 if outW - x > 2 (a 3rd column exists), else 0 */
+        int32_t enable4thWidth = XT_SALT(3, outW - x); /* 1 if outW - x > 3 (a 4th column exists), else 0 */
+
+        /* Output Data pointer */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); /* acc pitches are 0 when accTile is unused, so the offset is then 0 */
+
+        /* Initialize accumulators: from the bias when inputFlag is set, otherwise from accTile */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+        }
+        else  /* Accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); /* NOTE(review): the four loads below pass a decreasing byte count; presumably the intrinsic clamps it to [0, vector size] - confirm */
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL);
+          IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth); /* offset is 0 when the 2nd column is disabled, i.e. column 1 is re-read */
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL);
+          IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL);
+          IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL);
+          IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL);
+        }
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff, coeffAddrOff;
+
+        for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */
+        {
+          /* Condition checks performed to get the Input and Coefficient        */
+          /* Pointer Offsets after combining the Kernel Width and Height Loops  */
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Pointers for Input Data Loads (a disabled column gets offset 0, so it re-reads column 1) */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * 2 * enable3rdWidth);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * 3 * enable4thWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Primes registers for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels, 4 per iteration */
+          {
+            xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, 4);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0);
+#else
+            xb_vec2Nx8 dvecData1;
+            xb_vec2Nx8 dvecData2;
+            xb_vec2Nx8 dvecData3;
+            xb_vec2Nx8 dvecData4;
+
+            dvecData1 = IVP_SUB2NX8U(dvecInp1, 128);
+            dvecData2 = IVP_SUB2NX8U(dvecInp2, 128);
+            dvecData3 = IVP_SUB2NX8U(dvecInp3, 128);
+            dvecData4 = IVP_SUB2NX8U(dvecInp4, 128);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+#endif
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#endif
+          } /* End Input Channels */
+
+          /* Corner Case Handling if number of input channels not multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh; /* 1..3 here, since the loop above consumes channels in groups of 4 */
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, remInCh);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0);
+#else
+            xb_vec2Nx8 dvecData1 = 0;
+            xb_vec2Nx8 dvecData2 = 0;
+            xb_vec2Nx8 dvecData3 = 0;
+            xb_vec2Nx8 dvecData4 = 0;
+
+            IVP_SUB2NX8UT(dvecData1, dvecInp1, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+            IVP_SUB2NX8UT(dvecData2, dvecInp2, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+            IVP_SUB2NX8UT(dvecData3, dvecInp3, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+            IVP_SUB2NX8UT(dvecData4, dvecInp4, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+#endif
+            /* For conditional coefficient loads: the pointer advances only if a further channel remains */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Coefficient Loads (three vectors only; the 4th coefficient operand of the multiply below is 0) */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#endif
+          } /* End Corner case handling */
+        }   /* End Kernel Height * Width */
+
+        if (outputFlag)  /* Store to output Tile*/
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth; the H half's byte count is scaled by typeFlag, so it is 0 unless the output is S16 */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth; byte counts are scaled by enable2ndWidth, so they are 0 for a disabled column */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth; byte counts are scaled by enable3rdWidth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth; byte counts are scaled by enable4thWidth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable4thWidth);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1);
+          xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1);
+          xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1);
+          xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1);
+
+          xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2);
+          xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2);
+          xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2);
+          xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2);
+
+          xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3);
+          xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3);
+          xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3);
+          xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3);
+
+          xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4);
+          xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4);
+          xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4);
+          xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+
+
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth. NOTE(review): unlike the outTile stores, these byte counts are not scaled by the enable flag; a disabled column targets column 1's address and was computed from the same inputs, so it appears to rewrite identical data - confirm */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth);
+          IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth);
+          IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the  hvecAcc4 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth);
+          IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End Output Channels */
+  }     /* End image height */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution.     */
+/*               Each pass handles 4 output columns x 2N output channels.   */
+/*               Flags: INPUT = start from bias, OUTPUT = write outTile.    */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/*               (+ outputScaleArray when DILATED_VQ_CONV_PARTIAL is set)   */
+/* InOuts      : Output Tile, Accumulator Tile (partial sums)               */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16; dilationX is fixed to 1          */
+/*               Kernel Size is MxNxDxNk, M and N <= 16.                    */
+/*               Input and Output are in DWH format, Coeff is in NDWH       */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile are 0  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Output tile extent (dim2 = width, dim3 = height) and channel counts (dim1 = depth) */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel width / height: dim3 and dim4 of the NDWH coefficient tile */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t dilationX     = 1; /* this variant is hard-wired to no dilation along X */
+  const uint8_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);  /* set: accumulators start from bias, clear: from accTile */
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); /* set: results go to outTile, clear: back to accTile     */
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL; /* stays NULL when INPUT and OUTPUT flags are both set (accTile is never dereferenced then) */
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2; left at 0 when accTile is unused */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t numIter = kWidthU * numInCh; /* U8 input elements walked per kernel row (kWidth and inCh loops fused) */
+
+  int32_t dilatedKWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge; /* input border, in pixels / rows, that precedes the first kernel tap */
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else /* even width: leftEdgeFlag selects whether the left side gets the larger half */
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else /* even height: topEdgeFlag selects whether the top side gets the larger half */
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+
+  /* Step back by the edges so that pInData addresses the first tap of output (0, 0) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else /* no ReLU: clamp to the full range of the output element type */
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 only for S16 output */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN(); /* alignment register shared by every aligning store below */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  /*
+   * The inCh and kWidth loops are combined (numIter = kWidthU * numInCh).
+   * It is assumed that the edges along the depth dimension of both the
+   * input data and the coefficient data are zero.
+   */
+
+  /* Loop nest: y (rows) -> outCh (blocks of 2 * XCHAL_IVPN_SIMD_WIDTH) -> x (groups of 4 columns) */
+  for (y = 0; y < outH; y++)  /* Image Height */
+  {                           /* walk down the rows */
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* remainingOutCh handles the corner case when the number of output
+       * channels is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH */
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /* Load the output scale values for this block of output channels */
+      xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+      for (x = 0; x < outW; x += 4) /* Image Width */
+      {                             /* walk across the columns */
+        int32_t enable2ndWidth = XT_SALT(1, outW - x); /* presumably 1 iff 1 < outW - x, i.e. a 2nd column exists - TODO confirm XT_SALT */
+        int32_t enable3rdWidth = XT_SALT(2, outW - x); /* likewise for the 3rd column of the group */
+        int32_t enable4thWidth = XT_SALT(3, outW - x); /* likewise for the 4th column of the group */
+        /* Output and accumulator pointers for the first column of this group */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize the four accumulators, one per output column of the group */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        if (inputFlag) /* Seed all four accumulators from the bias values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+        }
+        else  /* Resume from the 32-bit partial sums held in the accumulator tile */
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+          /* Column 1: four variable-length quarter loads; the byte count drops by 2 * XCHAL_IVPN_SIMD_WIDTH per quarter */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL);
+          IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL);
+          /* Column 2: the address collapses onto column 1 when enable2ndWidth is 0 */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL);
+          IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL);
+          /* Column 3: the address collapses onto column 1 when enable3rdWidth is 0 */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL);
+          IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL);
+          /* Column 4: the address collapses onto column 1 when enable4thWidth is 0 */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL);
+          IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL);
+        }
+
+        /* Input pointer for output (x, y) with strides applied; coefficient pointer for this channel block */
+        uint8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Input row for this kernel row, one pointer per output column; a disabled column reuses column 1's address */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 2 * enable3rdWidth);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 3 * enable4thWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined, 4 input bytes per pass */
+          {
+            xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, 4);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0);
+#else
+            xb_vec2Nx8 dvecData1;
+            xb_vec2Nx8 dvecData2;
+            xb_vec2Nx8 dvecData3;
+            xb_vec2Nx8 dvecData4;
+            /* Fallback when IVP_MULSUQA2N8XR8 is not defined: subtract 128 from the U8 input before IVP_MULQA2N8XR8 */
+            dvecData1 = IVP_SUB2NX8U(dvecInp1, 128);
+            dvecData2 = IVP_SUB2NX8U(dvecInp2, 128);
+            dvecData3 = IVP_SUB2NX8U(dvecInp3, 128);
+            dvecData4 = IVP_SUB2NX8U(dvecInp4, 128);
+            /* NOTE(review): the -128 offset is presumably compensated outside this function (e.g. in the bias) - confirm */
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+#endif
+
+            /* Aligned Vector Loads of coefficients: 4 vectors coeffPitch1 apart (presumably one per input byte in qmulScalar) */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#endif
+          }   /* End (Input Channels * kWidth) */
+        } /* End Kernel Height */
+
+        if (outputFlag)  /* Store to output Tile */
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth; the H vector gets a non-zero byte count only when typeFlag is 1 */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); /* presumably flushes the bytes still pending in vaOutData - confirm */
+
+          /* Store the output dvecOut2 along the output depth; byte counts are scaled by enable2ndWidth (0 if the column is absent) */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth; byte counts are scaled by enable3rdWidth (0 if the column is absent) */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth; byte counts are scaled by enable4thWidth (0 if the column is absent) */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         remainingOutCh * enable4thWidth);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile */
+        {
+          xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1);
+          xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1);
+          xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1);
+          xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1);
+
+          xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2);
+          xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2);
+          xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2);
+          xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2);
+
+          xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3);
+          xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3);
+          xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3);
+          xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3);
+
+          xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4);
+          xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4);
+          xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4);
+          xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+
+
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth; counts are not scaled by enable2ndWidth, so an absent column rewrites column 1 with the same sums */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * enable2ndWidth));
+          IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth (same aliasing behaviour when enable3rdWidth is 0) */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 2 * enable3rdWidth));
+          IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc4 along the accTile depth (same aliasing behaviour when enable4thWidth is 0) */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 3 * enable4thWidth));
+          IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End Output Channels */
+  }     /* End image height */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t dilationX     = 1;
+  const uint8_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2 */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t numIter = kWidthU * numInCh;
+
+  int32_t dilatedKWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge;
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   */
+
+  /* Loops Start */
+  for (y = 0; y < outH; y++)  /* Image Height */
+  {                           /* walk down the rows */
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /*Load output scale values*/
+      xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+
+      for (x = 0; x < outW; x += 4) /* Image Width */
+      {                             /* walk across the columns */
+        /* Variable to handle corner case when width is odd */
+        int32_t enable2ndWidth = XT_SALT(1, outW - x);
+        int32_t enable3rdWidth = XT_SALT(2, outW - x);
+        int32_t enable4thWidth = XT_SALT(3, outW - x);
+
+        /* Output Data pointer */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize accumulators */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+        }
+        else  /* Accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL);
+          IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL);
+          IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL);
+          IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL);
+          IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL);
+        }
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 2 * enable3rdWidth);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 3 * enable4thWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+          for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, 4);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0);
+#else
+            xb_vec2Nx8 dvecData1;
+            xb_vec2Nx8 dvecData2;
+            xb_vec2Nx8 dvecData3;
+            xb_vec2Nx8 dvecData4;
+
+            dvecData1 = IVP_SUB2NX8U(dvecInp1, 128);
+            dvecData2 = IVP_SUB2NX8U(dvecInp2, 128);
+            dvecData3 = IVP_SUB2NX8U(dvecInp3, 128);
+            dvecData4 = IVP_SUB2NX8U(dvecInp4, 128);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+#endif
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#endif
+          }   /* End Input Channels */
+          /* Corner case handling as numIter is not a multiple of 4 */
+          if (k < numIter)
+          {
+            int32_t remInCh = numIter - k;
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, remInCh);
+
+#ifdef IVP_MULSUQA2N8XR8
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0);
+#else
+            xb_vec2Nx8 dvecData1 = 0;
+            xb_vec2Nx8 dvecData2 = 0;
+            xb_vec2Nx8 dvecData3 = 0;
+            xb_vec2Nx8 dvecData4 = 0;
+
+            IVP_SUB2NX8UT(dvecData1, dvecInp1, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+            IVP_SUB2NX8UT(dvecData2, dvecInp2, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+            IVP_SUB2NX8UT(dvecData3, dvecInp3, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+            IVP_SUB2NX8UT(dvecData4, dvecInp4, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh));
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+#endif
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#endif
+          } /* End Corner case handling */
+        }   /* End Kernel Height * Width */
+
+        if (outputFlag)  /* Store to ouput Tile*/
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         remainingOutCh * enable4thWidth);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1);
+          xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1);
+          xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1);
+          xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1);
+
+          xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2);
+          xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2);
+          xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2);
+          xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2);
+
+          xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3);
+          xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3);
+          xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3);
+          xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3);
+
+          xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4);
+          xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4);
+          xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4);
+          xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+
+
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * enable2ndWidth));
+          IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 2 * enable3rdWidth));
+          IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the  hvecAcc4 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 3 * enable4thWidth));
+          IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/*****************************************************************************
+*  xaiPartialConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH   \
+*  xaiPartialConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for MxN MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN MOD_DWH 3D partial */
+/*               dilated convolution function and MxN MOD_DWH 3D VQ partial */
+/*               dilated convolution function                               */
+/*               Stride values from 1 to 64 are supported                   */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and dilation = 1 for stride > 1                            */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Accumulator Tile, Output Tile                              */
+/* Assumptions : InData are U8, CoeffData are S8                            */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 64.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Accumulated value will be within 24bit range               */
+/****************************************************************************/
+#ifdef DILATED_VQ_CONV_PARTIAL /* VQ build: the signature takes an extra outputScaleArray */
+XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(
+  const xai_pTile3D inTile, /* input data tile; checked below to be U8 in XAI_DWH order */
+  const xai_pTile4D coeffTile, /* coefficient tile; checked below to be S8 in XAI_NDWH order */
+  const xai_pArray biasArray, /* bias array; its S32 type is checked only when the input flag is set */
+  const xai_pArray outputScaleArray, /* output scales; checked to be U16 with width >= coeffTile DIM1 */
+  xai_pTile3D accTile, /* accumulator tile; validated as S32 unless both input and output flags are set */
+  xai_pTile3D outTile, /* output tile; checked to be in XAI_DWH order */
+  const xai_cnn_conv_params *param /* stride, dilation, shifts, ReLU limits and input/output flags */
+  )
+#else /* non-VQ build: no scale array; the output scale is read from param */
+XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(
+  const xai_pTile3D inTile, /* input data tile; checked below to be U8 in XAI_DWH order */
+  const xai_pTile4D coeffTile, /* coefficient tile; checked below to be S8 in XAI_NDWH order */
+  const xai_pArray biasArray, /* bias array; its S32 type is checked only when the input flag is set */
+  xai_pTile3D accTile, /* accumulator tile; validated as S32 unless both input and output flags are set */
+  xai_pTile3D outTile, /* output tile; checked to be in XAI_DWH order */
+  const xai_cnn_conv_params *param /* stride, dilation, shifts, ReLU limits and input/output flags */
+  )
+#endif
+{
+  /* Error Checks: argument validation only; the XAI_CHECK_* macros are defined outside this function */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_U8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_POINTER(biasArray);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE,   \
+                    "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 64", \
+                    XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); /* DIM4 is reported as height, DIM3 as width */
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) &&                                      \
+                    ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG,                    \
+                    "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); /* each stride must lie in 1..64 */
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) ||                                                            \
+                    ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) &&                                                           \
+                     (XAI_CNN_CONV_GET_STRIDEX(param) == 1)), XAI_ERR_BADARG,                                              \
+                    "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when strideX is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONX(param)); /* dilationX > 1 passes only together with strideX == 1 */
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) ||                                                            \
+                    ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) &&                                                           \
+                     (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG,                                              \
+                    "\nDilationY = %hhu\nDilationY should be 1. It can be greater than 1 only when strideY is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONY(param)); /* dilationY > 1 passes only together with strideY == 1 */
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); /* NOTE(review): presumably enforces the "Ca2" coefficient alignment - confirm */
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV_PARTIAL /* the scale array exists only in the VQ build */
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); /* coeffTile DIM1 is reported as the number of kernels */
+#endif
+
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+
+    if (XAI_CNN_CONV_GET_FLAG_INPUT(param)) /* input flag set: biasArray must be S32 */
+    {
+      XAI_CHECK_ARRAY_S32(biasArray);
+    }
+    if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) /* accTile is validated unless both flags are set */
+    {
+      XAI_CHECK_TILE3D_S32(accTile);
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(accTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, accTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, accTile);
+      XAI_CHECK_TILE3D_DATA_ORDER(accTile, XAI_DWH);
+      XAI_CHECK_TILE3D_SIZE_EQ(accTile, outTile);
+    }
+    if (XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) /* output flag set: validate outTile and its overlap with the inputs */
+    {
+      XAI_CHECK_TILE3D(outTile);
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+      if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param))) /* accTile/outTile overlap is checked only when the input flag is clear */
+      {
+        XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(accTile, outTile);
+      }
+    }
+  }
+#ifndef DILATED_VQ_CONV_PARTIAL /* zero-scale shortcut: compiled only in the non-VQ build */
+  if ((XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) && \
+      XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) /* output requested with an output scale of 0 */
+  {
+    int32_t fillValue; /* constant written to outTile: 0, or CLAMP(0, reluMin, reluMax) when the ReLU flag is set */
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0)); /* returns the fill status; no convolution kernel is called */
+  }
+#endif
+  /* Calling further optimized function if dilationX = 1 and (inTile DIM1 == DIM1 pitch, i.e. no edges along depth, or kernelWidth = 1) */
+  if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1) &&                            \
+      ((XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile)) || \
+       (XAI_TILE4D_GET_DIM3(coeffTile) == 1)))
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) /* inTile DIM1 * coeffTile DIM3 is a multiple of 4: use the _x4 variant */
+    {
+#ifdef DILATED_VQ_CONV_PARTIAL
+      partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, \
+                                                                                   coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else
+      partialConvolved3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, \
+                                                                                 coeffTile, biasArray, accTile, outTile, param);
+#endif
+    }
+    else /* product is not a multiple of 4: use the plain contiguous_depth variant */
+    {
+#ifdef DILATED_VQ_CONV_PARTIAL
+      partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, \
+                                                                                coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else
+      partialConvolved3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, \
+                                                                              coeffTile, biasArray, accTile, outTile, param);
+#endif
+    }
+  }
+  else /* dilationX != 1, or (DIM1 != DIM1 pitch and kernelWidth != 1): use the generic MxN kernel */
+  {
+#ifdef DILATED_VQ_CONV_PARTIAL
+    partialConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(inTile, \
+                                                           coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else
+    partialConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(inTile, \
+                                                         coeffTile, biasArray, accTile, outTile, param);
+#endif
+  }
+  return(XAI_ERROR_STATUS()); /* the kernel calls above are plain statements; the returned status comes from XAI_ERROR_STATUS() */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for MxN MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN MOD_DWH 3D partial */
+/*               dilated convolution function and MxN MOD_DWH 3D VQ partial */
+/*               dilated convolution function                               */
+/*               Stride values from 1 to 64 are supported                   */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and dilation = 1 for stride > 1                            */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Accumulator Tile, Output Tile                              */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 64.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Accumulated value will be within 32bit range               */
+/****************************************************************************/
+#ifdef DILATED_VQ_CONV_PARTIAL
+XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{ /* Entry point: validates the arguments, then dispatches to the kernel variant matching dilationX and the input depth layout. */
+  /* Error Checks: tile types, data order, DRAM placement, kernel size, stride, dilation, shifts and buffer overlap */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_POINTER(biasArray);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE,   \
+                    "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 64", \
+                    XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile));
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) &&                                      \
+                    ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG,                    \
+                    "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) ||                                                           \
+                    ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) &&                                                          \
+                     (XAI_CNN_CONV_GET_STRIDEX(param) == 1)), XAI_ERR_BADARG,                                             \
+                    "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when stride is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONX(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) ||                                                           \
+                    ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) &&                                                          \
+                     (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG,                                             \
+                    "\nDilationY = %hhu\nDilationY should be 1. It can be greater than 1 only when stride is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV_PARTIAL
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+
+    if (XAI_CNN_CONV_GET_FLAG_INPUT(param)) /* bias element type is validated only when the input flag is set */
+    {
+      XAI_CHECK_ARRAY_S32(biasArray);
+    }
+    if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) /* accTile is validated unless both input and output flags are set */
+    {
+      XAI_CHECK_TILE3D_S32(accTile);
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(accTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, accTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, accTile);
+      XAI_CHECK_TILE3D_DATA_ORDER(accTile, XAI_DWH);
+      XAI_CHECK_TILE3D_SIZE_EQ(accTile, outTile);
+    }
+    if (XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) /* DRAM-boundary and overlap checks on outTile apply only when the output flag is set */
+    {
+      XAI_CHECK_TILE3D(outTile);
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+      if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param)))
+      {
+        XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(accTile, outTile);
+      }
+    }
+  }
+#ifndef DILATED_VQ_CONV_PARTIAL
+  if ((XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) && \
+      XAI_CNN_CONV_GET_FLAG_OUTPUT(param))
+  { /* Zero output scale with the output flag set: fill outTile with a constant and return without convolving */
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+
+  /* Calling further optimized function if dilationX = 1 and (no edges along depth, i.e. dim1 == dim1 pitch, or kernelWidth = 1) */
+  if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1) &&                            \
+      ((XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile)) || \
+       (XAI_TILE4D_GET_DIM3(coeffTile) == 1)))
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) /* (input depth * kernel width) divisible by 4 selects the _x4 variant */
+    {
+#ifdef DILATED_VQ_CONV_PARTIAL
+      partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth_x4(inTile, \
+                                                                              coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else
+      partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth_x4(inTile, \
+                                                                            coeffTile, biasArray, accTile, outTile, param);
+#endif
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV_PARTIAL
+      partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth(inTile, \
+                                                                           coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else
+      partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth(inTile, \
+                                                                         coeffTile, biasArray, accTile, outTile, param);
+#endif
+    }
+  }
+  else /* Generic path: dilationX != 1, or depth pitch differs from depth while kernelWidth > 1 */
+  {
+#ifdef DILATED_VQ_CONV_PARTIAL
+    partialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32(inTile, \
+                                                      coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else
+    partialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32(inTile, \
+                                                    coeffTile, biasArray, accTile, outTile, param);
+#endif
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{ /* Partial-convolution kernel: 4 output columns per pass; the inner loop consumes 4 (input channel * kernel width) taps per iteration. */
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t dilationX     = 1; /* fixed at 1 in this variant; the X dilation is not read from param */
+  const uint8_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) /* accTile is read only when the input and output flags are not both set */
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2 */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t numIter = kWidthU * numInCh; /* combined (kernel width * input channel) trip count; the inner loop advances through it 4 at a time */
+
+  int32_t dilatedKWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge;
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 when outTile is XAI_S16, else 0 */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   */
+
+  /* Loops Start */
+  for (y = 0; y < outH; y++)  /* Image Height */
+  {                           /* walk down the rows */
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /*Load output scale values*/
+      xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+      for (x = 0; x < outW; x += 4) /* Image Width */
+      {                             /* walk across the columns */
+        int32_t enable2ndWidth = XT_SALT(1, outW - x); /* presumably 1 when (1 < outW - x), i.e. a 2nd column remains, else 0 - TODO confirm XT_SALT semantics */
+        int32_t enable3rdWidth = XT_SALT(2, outW - x);
+        int32_t enable4thWidth = XT_SALT(3, outW - x);
+        /* Output Data pointer */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize accumulators */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+        }
+        else  /* Accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL);
+          IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth); /* when enable2ndWidth is 0 this is the first column's address */
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL);
+          IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL);
+          IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL);
+          IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL);
+        }
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads (a column whose enable flag is 0 gets the first column's address) */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 2 * enable3rdWidth);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 3 * enable4thWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End (Input Channels * kWidth) */
+        } /* End Kernel Height */
+
+        if (outputFlag)  /* Store to output Tile */
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth (byte counts are multiplied by the enable flag, so they are 0 for a disabled column) */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         remainingOutCh * enable4thWidth);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1);
+          xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1);
+          xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1);
+          xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1);
+
+          xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2);
+          xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2);
+          xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2);
+          xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2);
+
+          xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3);
+          xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3);
+          xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3);
+          xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3);
+
+          xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4);
+          xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4);
+          xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4);
+          xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+
+
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth (byte counts are NOT scaled by the enable flag here; a disabled column targets the first column's address) */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * enable2ndWidth));
+          IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 2 * enable3rdWidth));
+          IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the  hvecAcc4 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 3 * enable4thWidth));
+          IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End Output Channels */
+  }     /* End image height */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t dilationX     = 1;
+  const uint8_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2 */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t numIter = kWidthU * numInCh;
+
+  int32_t dilatedKWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge;
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  /*
+   * inCh and kWidth loops are combined. It is assumed that the
+   * edges along the depth dimension of the input data are zero and
+   * that the edges along the depth dimension of the coefficient data
+   * are also zero.
+   */
+
+  /* Loops Start */
+  for (y = 0; y < outH; y++)  /* Image Height */
+  {                           /* walk down the rows */
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /*Load output scale values*/
+      xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+
+      for (x = 0; x < outW; x += 4) /* Image Width */
+      {                             /* walk across the columns */
+        /* Flags to handle the corner case when fewer than 4 output columns remain */
+        int32_t enable2ndWidth = XT_SALT(1, outW - x);
+        int32_t enable3rdWidth = XT_SALT(2, outW - x);
+        int32_t enable4thWidth = XT_SALT(3, outW - x);
+
+        /* Output Data pointer */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize accumulators */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+        }
+        else  /* Accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL);
+          IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL);
+          IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL);
+          IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL);
+          IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL);
+        }
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 2 * enable3rdWidth);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 3 * enable4thWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+          for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End Input Channels */
+          /* Corner case handling as numIter is not a multiple of 4 */
+          if (k < numIter)
+          {
+            int32_t remInCh = numIter - k;
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          }   /* End if( k < numIter)*/
+        } /* End Kernel Height * Width */
+
+        if (outputFlag)  /* Store to output Tile*/
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                         remainingOutCh * enable4thWidth);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1);
+          xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1);
+          xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1);
+          xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1);
+
+          xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2);
+          xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2);
+          xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2);
+          xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2);
+
+          xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3);
+          xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3);
+          xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3);
+          xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3);
+
+          xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4);
+          xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4);
+          xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4);
+          xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+
+
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * enable2ndWidth));
+          IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 2 * enable3rdWidth));
+          IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the  hvecAcc4 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 3 * enable4thWidth));
+          IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End Output Channels */
+  }     /* End image height */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D partial convolution      */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 16.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.                                                  */
+/****************************************************************************/
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW      = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH      = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh   = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh  = XAI_TILE3D_GET_DIM1(outTile);
+  const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);
+  const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU   = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU  = XAI_TILE4D_GET_DIM4(coeffTile);
+  int32_t dilatedkWidthU  = dilationX * (kWidthU - 1) + 1;
+  int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1;
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData    = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  int32_t * pAccData = NULL;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+  }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Pitch of AccTile Data (DWH) in dim1 and dim2 */
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t leftEdge, topEdge;
+  if ((dilatedkWidthU % 2) != 0)
+  {
+    leftEdge = dilatedkWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1);
+  }
+
+  if ((dilatedkHeightU % 2) != 0)
+  {
+    topEdge = dilatedkHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1);
+  }
+
+
+  /* Move pointer to the start of the data (including edge) */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0;
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vecN_2x32v* restrict phvecAcc;
+
+  /* Loops Start */
+  for (y = 0; y < outH; y++) /* Image Height */
+  {                          /* walk down the rows */
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV_PARTIAL
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /*Load output scale values*/
+      xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+      for (x = 0; x < outW; x += 4) /* Image Width */
+      {                             /* walk across the columns */
+        int32_t enable2ndWidth = XT_SALT(1, outW - x);
+        int32_t enable3rdWidth = XT_SALT(2, outW - x);
+        int32_t enable4thWidth = XT_SALT(3, outW - x);
+        /* Output Data pointer */
+        int8_t *pOut  = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+        int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2);
+
+        /* Initialize accumulators from bias values or from the accumulator tile */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        if (inputFlag) /* Bias Values */
+        {
+          phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+          ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+        }
+        else  /* Accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH;
+          xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH;
+          xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH;
+          xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH;
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          valign vaAcc = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL);
+          IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL);
+          IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL);
+          IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL);
+
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth);
+          vaAcc    = IVP_LAN_2X32_PP(phvecAcc);
+          IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh);
+          IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL);
+          IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL);
+        }
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData  = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+        xb_vecN_2x32v hvecInAddrOff    = 0;
+        xb_vecN_2x32v hvecCoeffAddrOff = 0;
+        xb_vecN_2x32v hvecLaneIdx      = 0;
+        int32_t inAddrOff, coeffAddrOff;
+
+        for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */
+        {
+          /* Condition checks performed to get the Input and Coefficient        */
+          /* Pointer Offsets after combining the Kernel Width and Height Loops  */
+          vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU);
+          /* hvecLaneIdx will be reset to zero after every kWidth */
+          hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2);
+          /* InPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2);
+          /* CoeffPitch added after every kWidth */
+          IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2);
+          /* Extracting Input and Coefficient address offsets */
+          inAddrOff        = IVP_EXTRN_2X32(hvecInAddrOff, 0);
+          coeffAddrOff     = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0);
+          hvecLaneIdx      = IVP_ADDN_2X32(hvecLaneIdx, 1);
+          hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2);
+          hvecInAddrOff    = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX);
+
+          /* Pointers for Input Data Loads */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + inAddrOff);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * 2 * enable3rdWidth);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + inAddrOff + (strideX * inDataPitch1 * 3 * enable4thWidth));
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff);
+
+          /* Primes registers for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+          for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */
+          {
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+
+          /* Corner Case Handling if number of input channels not multiple of 4 */
+          if (inCh < numInCh)
+          {
+            int32_t remInCh = numInCh - inCh;
+
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Coefficient Loads */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Corner case handling */
+        }   /* End Kernel Height * Width */
+
+        if (outputFlag)  /* Store to ouput Tile*/
+        {
+          /* Pack, Output Scale, Output Shift and clamping */
+          xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+          xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                           outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                        outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+          /* Store the output dvecOut1 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+          IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut2 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+          IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut3 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+          IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+          /* Store the output dvecOut4 along the output depth */
+          pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel);
+          IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable4thWidth);
+          IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                         (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth);
+          IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+        }
+        else /* Store to accumulator tile*/
+        {
+          xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1);
+          xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1);
+          xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1);
+          xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1);
+
+          xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2);
+          xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2);
+          xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2);
+          xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2);
+
+          xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3);
+          xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3);
+          xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3);
+          xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3);
+
+          xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4);
+          xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4);
+          xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4);
+          xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+
+
+          /* Store the hvecAcc1 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc2 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * enable2ndWidth));
+          IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the hvecAcc3 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 2 * enable3rdWidth));
+          IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+
+          /* Store the  hvecAcc4 along the accTile depth */
+          phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 3 * enable4thWidth));
+          IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh);
+          IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH);
+          IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc);
+        }
+      } /* End image width */
+    }   /* End image height */
+  }     /* End Output Channels */
+}
+
+/*****************************************************************************
+*  xaiPartialConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH   \
+*  xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH
+*  **************************************************************************/
+
+/****************************************************************************/
+/* Description : P6 optimized generic implementation for MxN MOD_DWH        */
+/*               3D convolution. Based on pre-processor specifiers. Code    */
+/*               implementation is generated during preprocessing stage.    */
+/*               This method can be used to generate MxN MOD_DWH 3D partial */
+/*               dilated convolution function and MxN MOD_DWH 3D VQ partial */
+/*               dilated convolution function                               */
+/*               Strides 1, 2 and 4 are supported (checks accept 1..64)     */
+/*               Implementation also supports dilation >= 1 for stride = 1  */
+/*               and dilation = 1 for stride = 2, 4                         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array, CNN convolution params structure       */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Accumulator Tile, Output Tile                              */
+/* Assumptions : InData, CoeffData are S8                                   */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 64 (the limit enforced by the error checks).      */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Accumulated value will be within 24bit range               */
+/****************************************************************************/
+#ifdef DILATED_VQ_CONV_PARTIAL /* VQ build: takes an extra outputScaleArray argument */
+XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else /* non-VQ build: the output scale is read from param */
+XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D accTile,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif /* DILATED_VQ_CONV_PARTIAL */
+{
+  /* Error checks: validate tiles, kernel size, stride, dilation, shifts and ReLU limits before dispatching */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S8(inTile);
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_POINTER(biasArray);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE,   \
+                    "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 64", \
+                    XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile));
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) &&                                      \
+                    ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG,                    \
+                    "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) ||                                                            \
+                    ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) &&                                                           \
+                     (XAI_CNN_CONV_GET_STRIDEX(param) == 1)), XAI_ERR_BADARG,                                              \
+                    "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when strideX is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONX(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) ||                                                            \
+                    ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) &&                                                           \
+                     (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG,                                              \
+                    "\nDilationY = %hhu\nDilationY should be 1. It can be greater than 1 only when strideY is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV_PARTIAL
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+    /* Flag-dependent checks: bias needs FLAG_INPUT, accTile is skipped only when both flags are set, outTile needs FLAG_OUTPUT */
+    if (XAI_CNN_CONV_GET_FLAG_INPUT(param))
+    {
+      XAI_CHECK_ARRAY_S32(biasArray);
+    }
+    if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+    {
+      XAI_CHECK_TILE3D_S32(accTile);
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(accTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, accTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, accTile); /* NOTE(review): coeffTile is a 4D tile passed to a 3D overlap check - confirm the macro accepts it */
+      XAI_CHECK_TILE3D_DATA_ORDER(accTile, XAI_DWH);
+      XAI_CHECK_TILE3D_SIZE_EQ(accTile, outTile);
+    }
+    if (XAI_CNN_CONV_GET_FLAG_OUTPUT(param))
+    {
+      XAI_CHECK_TILE3D(outTile);
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+      if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param)))
+      {
+        XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(accTile, outTile);
+      }
+    }
+  }
+#ifndef DILATED_VQ_CONV_PARTIAL /* zero-scale shortcut exists only in the non-VQ build */
+  if ((XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) && \
+      XAI_CNN_CONV_GET_FLAG_OUTPUT(param))
+  {
+    int32_t fillValue; /* constant written to outTile: 0, or CLAMP(0, reluMin, reluMax) when ReLU is enabled */
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0;
+    return(xaiFillTile3D(outTile, fillValue, 0)); /* output scale is 0: skip the convolution and return the fill status */
+  }
+#endif /* !DILATED_VQ_CONV_PARTIAL */
+  /* Dispatch: dilationX == 1 with (inTile dim1 == dim1 pitch, i.e. no edges along depth, or kernel width == 1) selects a contiguous_depth variant */
+  if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1) &&                            \
+      ((XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile)) || \
+       (XAI_TILE4D_GET_DIM3(coeffTile) == 1)))
+  {
+    if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) /* dim1 * kernel width is a multiple of 4: use the _x4 variant */
+    {
+#ifdef DILATED_VQ_CONV_PARTIAL
+      partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, \
+                                                                                   coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else
+      partialConvolved3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, \
+                                                                                 coeffTile, biasArray, accTile, outTile, param);
+#endif
+    }
+    else
+    {
+#ifdef DILATED_VQ_CONV_PARTIAL
+      partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, \
+                                                                                coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else
+      partialConvolved3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, \
+                                                                              coeffTile, biasArray, accTile, outTile, param);
+#endif
+    }
+  }
+  else /* generic path: dilationX != 1, or dim1 != dim1 pitch together with kernel width != 1 */
+  {
+#ifdef DILATED_VQ_CONV_PARTIAL
+    partialConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(inTile, \
+                                                           coeffTile, biasArray, outputScaleArray, accTile, outTile, param);
+#else
+    partialConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(inTile, \
+                                                         coeffTile, biasArray, accTile, outTile, param);
+#endif
+  }
+  return(XAI_ERROR_STATUS());
+}
+#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.c
new file mode 100644
index 00000000000..2adb107cd07
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+/* Include the shared header with DILATED_VQ_CONV_PARTIAL undefined; presumably this emits the non-VQ variants - confirm against the header */
+#undef DILATED_VQ_CONV_PARTIAL
+#include "cnn_dilated_conv_partial_MOD_S16.h"
+#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.h
new file mode 100644
index 00000000000..f2ce98ced0d
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.h
@@ -0,0 +1,878 @@
+/*
+ * Copyright (c) 2023 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+#include "limits.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+/********* partialConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth ***********/
+/********** partialConvolved3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth ************/
+/***********************************************************************************/
+/* Description : Specialized optimized implementation for partial 3D convolution   */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array, Output Scale Array, */
+/*               CNN convolution params structure                                  */
+/* Outputs     :                                                                   */
+/* InOuts      : Accumulator Tile, Output Tile                                     */
+/* Assumptions : InData, CoeffData are S16                                         */
+/*               OutData is U16 / S16                                              */
+/*               Input is in DWH and Output is in DWH format                       */
+/*               Coeff is in NDWH format                                           */
+/*               Input does not have edges along the depth dimension               */
+/*               dilationX = dilationY = 1 always                                  */
+/*               Accumulated value will be within 48-bit range                     */
+/***********************************************************************************/
+#ifdef DILATED_VQ_CONV_PARTIAL
+static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(const xai_pTile3D inTile,
+                                                                                       const xai_pTile4D coeffTile,
+                                                                                       const xai_pArray biasArray,
+                                                                                       const xai_pArray outputScaleArray,
+                                                                                       xai_pTile3D accTile,
+                                                                                       xai_pTile3D outTile,
+                                                                                       const xai_cnn_conv_params *param)
+#else
+static _XAI_INLINE_ void partialConvolved3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(const xai_pTile3D inTile,
+                                                                                     const xai_pTile4D coeffTile,
+                                                                                     const xai_pArray biasArray,
+                                                                                     xai_pTile3D accTile,
+                                                                                     xai_pTile3D outTile,
+                                                                                     const xai_cnn_conv_params *param)
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t numInCh         = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh        = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outWidth        = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outHeight       = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t inDataPitch1    = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2    = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outDataPitch1   = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2   = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const int32_t coeffDataPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffDataPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+  const int32_t kWidthU         = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU        = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* Convolution params */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  const uint16_t *pOutputScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Data Pointers of input, coefficient, biasData */
+  const int16_t *pInData    = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  const int16_t *pCoeffData = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  const int64_t *pBiasData  = (int64_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  /* Data Pointers of output and scratch buffer data */
+  int16_t *pOutData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int64_t *pAccData = NULL;
+
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) /* accTile is neither loaded nor stored when both flags are set */
+  {
+    pAccData      = (int64_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t leftEdge, topEdge;
+
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Move the input pointer back by topEdge rows and leftEdge columns, i.e. to the start of the edge data. */
+  pInData = &pInData[-(int32_t) ((topEdge) * inDataPitch2 + (leftEdge) * inDataPitch1)];
+
+  /* Setting the limits for output data according to whether ReLU is enabled or not */
+  int32_t minLim, maxLim;
+
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0);
+    maxLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX : USHRT_MAX);
+  }
+
+  int32_t outCh, x, y, ky, numIter, iter;
+
+  numIter = (numInCh * kWidthU); /* elements in one kernel row (depth x kernel width), walked as one flat run */
+
+  xb_vecN_2x32v *restrict phvecIn1;
+  xb_vecN_2x32v *restrict phvecIn2;
+  xb_vecN_2x32v *restrict phvecIn3;
+  xb_vecN_2x32v *restrict phvecIn4;
+  xb_vecNx16 *restrict pvecCoeff;
+  xb_vec2Nx8 *restrict pdvecBias;
+  xb_vec2Nx8 *restrict pdvecAccData;
+  xb_vecNx16 *restrict pvecOut;
+
+  xb_vecNx48 vecAcc1 = 0, vecAcc2 = 0, vecAcc3 = 0, vecAcc4 = 0, vecBias = 0;
+  xb_vecN_2x32v hvecIn1, hvecIn2, hvecIn3, hvecIn4;
+  xb_vecNx16 vecCoeff1, vecCoeff2;
+  xb_vecNx16 vecOut1, vecOut2, vecOut3, vecOut4;
+  xb_vec2Nx8 dvecAccLL, dvecAccLH, dvecAccHL, dvecAccHH;
+
+  valign vaIn1, vaIn2, vaIn3, vaIn4, vaBias, vaAcc;
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  xb_vecNx16U vecOutScaleU;
+  xb_vecNx16U *restrict pvecOutScaleData = (xb_vecNx16U *) (pOutputScaleData);
+  valign vaScale                         = IVP_LANX16U_PP(pvecOutScaleData);
+#endif
+
+  pdvecBias = (xb_vec2Nx8 *) (pBiasData);
+  vaBias    = IVP_LA2NX8_PP(pdvecBias);
+  valign vaOut = IVP_ZALIGN();
+
+  for (outCh = 0; outCh < numOutCh; outCh += XCHAL_IVPN_SIMD_WIDTH)
+  {
+    int32_t remOutCh = (numOutCh - outCh);
+    /* Load the 64-bit bias values for this group of output channels into the 48-bit vector vecBias */
+    if (inputFlag) // Biases will be loaded only when "inputFlag" is set
+    {
+      ACC_INIT_BIAS64_MOD_ONEACC(pdvecBias, vaBias, remOutCh, vecBias);
+    }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+    IVP_LAVNX16U_XP(vecOutScaleU, vaScale, pvecOutScaleData, 2 * remOutCh);
+#endif
+
+    for (y = 0; y < outHeight; y += 2)
+    {
+      // "remY" is 1 when a second output row (y + 1) exists, otherwise 0 (integrated tail handling)
+      int32_t remY = XT_MIN(1, outHeight - y - 1);
+      for (x = 0; x < outWidth; x += 2)
+      {
+        // "remX" is 1 when a second output column (x + 1) exists, otherwise 0 (integrated tail handling)
+        int32_t remX    = XT_MIN(1, outWidth - x - 1);
+        int16_t *pData1 = (int16_t *) (pInData + (x * strideX * inDataPitch1) + (y * strideY * inDataPitch2));
+        int64_t *pAcc   = (int64_t *) (pAccData + outCh + (x * accDataPitch1) + (y * accDataPitch2));
+
+        if (inputFlag) // if "inputFlag" is set, then initialize the accumulators with the bias values
+        {
+          /* Initializing all the 4 accumulators with bias values before accumulating for every spatial location */
+          vecAcc4 = vecAcc3 = vecAcc2 = vecAcc1 = vecBias;
+        }
+        else // if "inputFlag" is not-set, then initialize the accumulators with the values stored in the accTile
+        {
+          // Loading accumulated values from W = 0, H = 0 spatial location initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc);
+          vaAcc        = IVP_LA2NX8_PP(pdvecAccData);
+          IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          vecAcc1 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL);
+          IVP_CVT48UN_2X64H(vecAcc1, dvecAccHH, dvecAccHL);
+
+          // Loading accumulated values from W = 1, H = 0 spatial location initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1));
+          vaAcc        = IVP_LA2NX8_PP(pdvecAccData);
+          IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          vecAcc2 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL);
+          IVP_CVT48UN_2X64H(vecAcc2, dvecAccHH, dvecAccHL);
+
+          // Loading accumulated values from W = 0, H = 1 spatial location initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remY * accDataPitch2));
+          vaAcc        = IVP_LA2NX8_PP(pdvecAccData);
+          IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          vecAcc3 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL);
+          IVP_CVT48UN_2X64H(vecAcc3, dvecAccHH, dvecAccHL);
+
+          // Loading accumulated values from W = 1, H = 1 spatial location initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1) + (remY * accDataPitch2));
+          vaAcc        = IVP_LA2NX8_PP(pdvecAccData);
+          IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          vecAcc4 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL);
+          IVP_CVT48UN_2X64H(vecAcc4, dvecAccHH, dvecAccHL);
+        }
+
+        for (ky = 0; ky < kHeightU; ky++)
+        {
+          // Adjusting the coefficient data pointer
+          pvecCoeff = (xb_vecNx16 *) (pCoeffData + outCh + (ky * coeffDataPitch3));
+          int16_t *pData2 = (int16_t *) (pData1 + (ky * inDataPitch2));
+          // phvecIn1 initially points to W = 0, H = 0 spatial location
+          phvecIn1 = (xb_vecN_2x32v *) (pData2);
+          // phvecIn2 initially points to W = 1, H = 0 spatial location
+          phvecIn2 = (xb_vecN_2x32v *) (pData2 + (remX * strideX * inDataPitch1));
+          // phvecIn3 initially points to W = 0, H = 1 spatial location
+          phvecIn3 = (xb_vecN_2x32v *) (pData2 + (remY * strideY * inDataPitch2));
+          // phvecIn4 initially points to W = 1, H = 1 spatial location
+          phvecIn4 = (xb_vecN_2x32v *) (pData2 + (remX * strideX * inDataPitch1) + (remY * strideY * inDataPitch2));
+
+          vaIn1 = IVP_LAN_2X32_PP(phvecIn1);
+          vaIn2 = IVP_LAN_2X32_PP(phvecIn2);
+          vaIn3 = IVP_LAN_2X32_PP(phvecIn3);
+          vaIn4 = IVP_LAN_2X32_PP(phvecIn4);
+
+          for (iter = 0; iter < (numIter - 1); iter += 2)
+          {
+            // hvecIn1 contains 4 bytes or 2 elements along D from W = 0, H = 0 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn1, vaIn1, phvecIn1, 4);
+            // hvecIn2 contains 4 bytes or 2 elements along D from W = 1, H = 0 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn2, vaIn2, phvecIn2, 4);
+            // hvecIn3 contains 4 bytes or 2 elements along D from W = 0, H = 1 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn3, vaIn3, phvecIn3, 4);
+            // hvecIn4 contains 4 bytes or 2 elements along D from W = 1, H = 1 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn4, vaIn4, phvecIn4, 4);
+
+            // vecCoeff1 contains 64 bytes or 32 elements along output depth (N) from initial input depth (D = 0) initially
+            IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, (2 * coeffDataPitch1));
+            // vecCoeff2 contains 64 bytes or 32 elements along output depth (N) from next input depth (D = 1) initially
+            IVP_L2UNX16_XP(vecCoeff2, pvecCoeff, (2 * coeffDataPitch1));
+
+            // vecAcc1 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 0 spatial location initially
+            IVP_MULPAN16XR16(vecAcc1, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn1, 0));
+            // vecAcc2 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 0 spatial location initially
+            IVP_MULPAN16XR16(vecAcc2, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn2, 0));
+            // vecAcc3 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 1 spatial location initially
+            IVP_MULPAN16XR16(vecAcc3, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn3, 0));
+            // vecAcc4 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 1 spatial location initially
+            IVP_MULPAN16XR16(vecAcc4, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn4, 0));
+          } // End of for (iter = 0; iter < (numIter - 1); iter += 2)
+          if (iter < numIter)
+          {
+            // hvecIn1 contains 2 bytes or 1 element along D from W = 0, H = 0 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn1, vaIn1, phvecIn1, 2);
+            // hvecIn2 contains 2 bytes or 1 element along D from W = 1, H = 0 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn2, vaIn2, phvecIn2, 2);
+            // hvecIn3 contains 2 bytes or 1 element along D from W = 0, H = 1 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn3, vaIn3, phvecIn3, 2);
+            // hvecIn4 contains 2 bytes or 1 element along D from W = 1, H = 1 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn4, vaIn4, phvecIn4, 2);
+
+            // vecCoeff1 contains 64 bytes or 32 elements along output depth (N) from initial input depth (D = 0)
+            IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, (2 * coeffDataPitch1));
+
+            // vecAcc1 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 0 spatial location initially
+            IVP_MULPAN16XR16(vecAcc1, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn1, 0));
+            // vecAcc2 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 0 spatial location initially
+            IVP_MULPAN16XR16(vecAcc2, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn2, 0));
+            // vecAcc3 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 1 spatial location initially
+            IVP_MULPAN16XR16(vecAcc3, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn3, 0));
+            // vecAcc4 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 1 spatial location initially
+            IVP_MULPAN16XR16(vecAcc4, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn4, 0));
+          }
+        } // End of for (ky = 0; ky < kHeightU; ky++)
+
+        if (outputFlag) // if "outputFlag" is set, apply pack, scale, shift, clamp logic on accumulated values and store the output
+        {
+          /* Pack, scale, shift, clamp logic to follow */
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut1, vecAcc1, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut2, vecAcc2, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut3, vecAcc3, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut4, vecAcc4, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1, vecAcc1, packShiftAccU, outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2, vecAcc2, packShiftAccU, outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3, vecAcc3, packShiftAccU, outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4, vecAcc4, packShiftAccU, outScale, outShiftU, minLim, maxLim);
+#endif
+          // Storing 64 bytes or 32 elements along output depth (N) at W = 0, H = 0 initially
+          pvecOut = (xb_vecNx16 *) (pOutData + outCh + (x * outDataPitch1) + (y * outDataPitch2));
+          IVP_SAVNX16_XP(vecOut1, vaOut, pvecOut, (2 * remOutCh));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+
+          // Storing 64 bytes or 32 elements along output depth (N) at W = 1, H = 0 initially (byte count is 0 when remX == 0)
+          pvecOut = (xb_vecNx16 *) (pOutData + outCh + ((x + remX) * outDataPitch1) + (y * outDataPitch2));
+          IVP_SAVNX16_XP(vecOut2, vaOut, pvecOut, (2 * remOutCh) * remX);
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+
+          // Storing 64 bytes or 32 elements along output depth (N) at W = 0, H = 1 initially (byte count is 0 when remY == 0)
+          pvecOut = (xb_vecNx16 *) (pOutData + outCh + (x * outDataPitch1) + ((y + remY) * outDataPitch2));
+          IVP_SAVNX16_XP(vecOut3, vaOut, pvecOut, (2 * remOutCh) * remY);
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+
+          // Storing 64 bytes or 32 elements along output depth (N) at W = 1, H = 1 initially (byte count is 0 when remX or remY is 0)
+          pvecOut = (xb_vecNx16 *) (pOutData + outCh + ((x + remX) * outDataPitch1) + ((y + remY) * outDataPitch2));
+          IVP_SAVNX16_XP(vecOut4, vaOut, pvecOut, (2 * remOutCh) * remX * remY);
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+        else // if "outputFlag" is not-set, store the accumulated values to the accTile
+        {
+          vaAcc     = IVP_ZALIGN();
+          dvecAccLL = IVP_CVT64SNX48LL(vecAcc1);
+          dvecAccLH = IVP_CVT64SNX48LH(vecAcc1);
+          dvecAccHL = IVP_CVT64SNX48HL(vecAcc1);
+          dvecAccHH = IVP_CVT64SNX48HH(vecAcc1);
+          // Storing 32 elements at W = 0, H = 0 initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc);
+          IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData);
+
+          dvecAccLL = IVP_CVT64SNX48LL(vecAcc2);
+          dvecAccLH = IVP_CVT64SNX48LH(vecAcc2);
+          dvecAccHL = IVP_CVT64SNX48HL(vecAcc2);
+          dvecAccHH = IVP_CVT64SNX48HH(vecAcc2);
+          // Storing 32 elements at W = 1, H = 0 initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1));
+          IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData);
+
+          dvecAccLL = IVP_CVT64SNX48LL(vecAcc3);
+          dvecAccLH = IVP_CVT64SNX48LH(vecAcc3);
+          dvecAccHL = IVP_CVT64SNX48HL(vecAcc3);
+          dvecAccHH = IVP_CVT64SNX48HH(vecAcc3);
+          // Storing 32 elements at W = 0, H = 1 initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remY * accDataPitch2));
+          IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData);
+
+          dvecAccLL = IVP_CVT64SNX48LL(vecAcc4);
+          dvecAccLH = IVP_CVT64SNX48LH(vecAcc4);
+          dvecAccHL = IVP_CVT64SNX48HL(vecAcc4);
+          dvecAccHH = IVP_CVT64SNX48HH(vecAcc4);
+          // Storing 32 elements at W = 1, H = 1 initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1) + (remY * accDataPitch2));
+          IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData);
+        }
+      } // End of for (x = 0; x < outWidth; x += 2)
+    }   // End of for (y = 0; y < outHeight; y += 2)
+  }     // End of for (outCh = 0; outCh < numOutCh; outCh += XCHAL_IVPN_SIMD_WIDTH)
+}
+
+/***************** xaiPartialConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH *****************/
+/****************** xaiPartialConvolved3D_S_MxN_S16S16I16_MOD_DWH ******************/
+/***********************************************************************************/
+/* Description : Optimized implementation for partial 3D convolution               */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array, Output Scale Array, */
+/*               CNN convolution params structure                                  */
+/* Outputs     : XI Error Code                                                     */
+/* InOuts      : Accumulator Tile, Output Tile                                     */
+/* Assumptions : InData, CoeffData are S16                                         */
+/*               OutData is U16 / S16                                              */
+/*               Input is in DWH and Output is in DWH format                       */
+/*               Coeff is in NDWH format                                           */
+/*               Accumulated value will be within 48-bit range                     */
+/***********************************************************************************/
+#ifdef DILATED_VQ_CONV_PARTIAL
+XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                             const xai_pTile4D coeffTile,
+                                                             const xai_pArray biasArray,
+                                                             const xai_pArray outputScaleArray,
+                                                             xai_pTile3D accTile,
+                                                             xai_pTile3D outTile,
+                                                             const xai_cnn_conv_params *param)
+#else
+XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                           const xai_pTile4D coeffTile,
+                                                           const xai_pArray biasArray,
+                                                           xai_pTile3D accTile,
+                                                           xai_pTile3D outTile,
+                                                           const xai_cnn_conv_params *param)
+#endif
+{
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S16(inTile);
+    XAI_CHECK_TILE4D_S16(coeffTile);
+    XAI_CHECK_POINTER(biasArray);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE,   \
+                    "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 64", \
+                    XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile));
+    XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) &&                                      \
+                    ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG,                    \
+                    "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1) ||                                                          \
+                    ((XAI_CNN_CONV_GET_DILATION(param) >= 1) &&                                                         \
+                     (XAI_CNN_CONV_GET_STRIDEX(param) == 1) && (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG, \
+                    "\nDilation = %hhu\nDilation should be 1. It can be greater than 1 only when stride is equal to 1", \
+                    XAI_CNN_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                                             \
+                    XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 32,                                     \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32,                               \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#ifdef DILATED_VQ_CONV_PARTIAL
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE,                                                      \
+                    "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile));
+#endif
+    XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param);
+
+    if (XAI_CNN_CONV_GET_FLAG_INPUT(param))
+    {
+      XAI_CHECK_ARRAY_S64(biasArray);
+    }
+    if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+    {
+      XAI_CHECK_TILE3D_S64(accTile);
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(accTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, accTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, accTile);
+      XAI_CHECK_TILE3D_DATA_ORDER(accTile, XAI_DWH);
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(accTile) >= XAI_TILE3D_GET_DIM1(outTile)), XAI_ERR_DATASIZE,         \
+                      "\ndim1Size of accTile = %d, should be greater than or equal to %d(dim1Size of outTile)", \
+                      XAI_TILE3D_GET_DIM1(accTile), XAI_TILE3D_GET_DIM1(outTile));
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(accTile) >= XAI_TILE3D_GET_DIM2(outTile)), XAI_ERR_DATASIZE,         \
+                      "\ndim2Size of accTile = %d, should be greater than or equal to %d(dim2Size of outTile)", \
+                      XAI_TILE3D_GET_DIM2(accTile), XAI_TILE3D_GET_DIM2(outTile));
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(accTile) >= XAI_TILE3D_GET_DIM3(outTile)), XAI_ERR_DATASIZE,         \
+                      "\ndim3Size of accTile = %d, should be greater than or equal to %d(dim3Size of outTile)", \
+                      XAI_TILE3D_GET_DIM3(accTile), XAI_TILE3D_GET_DIM3(outTile));
+    }
+    if (XAI_CNN_CONV_GET_FLAG_OUTPUT(param))
+    {
+      XAI_CHECK_ERROR(XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16), \
+                      XAI_ERR_DATATYPE, "\nOutTile data type need to be either XAI_S16 or XAI_U16");
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+      if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param)))
+      {
+        XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(accTile, outTile);
+      }
+    }
+  }
+
+  const uint8_t dilationU = XAI_CNN_CONV_GET_DILATION(param);
+
+  /* Calling further optimized variant based on certain conditions */
+  if ((XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile)) && (dilationU == 1))
+  {
+#ifdef DILATED_VQ_CONV_PARTIAL
+    partialConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(inTile, coeffTile, biasArray, outputScaleArray, \
+                                                                  accTile, outTile, param);
+#else
+    partialConvolved3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(inTile, coeffTile, biasArray, \
+                                                                accTile, outTile, param);
+#endif
+
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t numInCh         = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh        = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outWidth        = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outHeight       = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t inDataPitch1    = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2    = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outDataPitch1   = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2   = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const int32_t coeffDataPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffDataPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffDataPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+  const int32_t kWidthU         = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU        = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* Convolution params */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  const uint8_t inputFlag     = XAI_CNN_CONV_GET_FLAG_INPUT(param);
+  const uint8_t outputFlag    = XAI_CNN_CONV_GET_FLAG_OUTPUT(param);
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  const uint16_t *pOutputScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+
+  /* Data Pointers of input, coefficient, biasData */
+  const int16_t *pInData    = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  const int16_t *pCoeffData = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  const int64_t *pBiasData  = (int64_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  /* Data Pointers of output and scratch buffer data */
+  int16_t *pOutData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int64_t *pAccData = NULL;
+
+  int32_t accDataPitch1 = 0;
+  int32_t accDataPitch2 = 0;
+
+  if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param)))
+  {
+    pAccData      = (int64_t *) XAI_TILE3D_GET_DATA_PTR(accTile);
+    accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile);
+    accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile);
+  }
+
+  int32_t dilatedKWidthU  = dilationU * (kWidthU - 1) + 1;
+  int32_t dilatedKHeightU = dilationU * (kHeightU - 1) + 1;
+  int32_t leftEdge, topEdge;
+
+  if ((dilatedKWidthU % 2) != 0)
+  {
+    leftEdge = dilatedKWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1);
+  }
+
+  if ((dilatedKHeightU % 2) != 0)
+  {
+    topEdge = dilatedKHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1);
+  }
+
+  /* move to start of edge data only when input is already padded. */
+  pInData = &pInData[-(int32_t) ((topEdge) * inDataPitch2 + (leftEdge) * inDataPitch1)];
+
+  /* Set the output clamp limits according to whether ReLU is enabled or not */
+  int32_t minLim, maxLim;
+
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0);
+    maxLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX : USHRT_MAX);
+  }
+
+  int32_t inCh, outCh, x, y, k;
+
+  xb_vecN_2x32v *restrict phvecIn1;
+  xb_vecN_2x32v *restrict phvecIn2;
+  xb_vecN_2x32v *restrict phvecIn3;
+  xb_vecN_2x32v *restrict phvecIn4;
+  xb_vecNx16 *restrict pvecCoeff;
+  xb_vec2Nx8 *restrict pdvecBias;
+  xb_vec2Nx8 *restrict pdvecAccData;
+  xb_vecNx16 *restrict pvecOut;
+
+  xb_vecNx48 vecAcc1 = 0, vecAcc2 = 0, vecAcc3 = 0, vecAcc4 = 0, vecBias = 0;
+  xb_vecN_2x32v hvecIn1, hvecIn2, hvecIn3, hvecIn4;
+  xb_vecNx16 vecCoeff1, vecCoeff2;
+  xb_vecNx16 vecOut1, vecOut2, vecOut3, vecOut4;
+  xb_vec2Nx8 dvecAccLL, dvecAccLH, dvecAccHL, dvecAccHH;
+
+  valign vaIn1, vaIn2, vaIn3, vaIn4, vaBias, vaAcc;
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+  xb_vecNx16U vecOutScaleU;
+  xb_vecNx16U *restrict pvecOutScaleData = (xb_vecNx16U *) (pOutputScaleData);
+  valign vaScale                         = IVP_LANX16U_PP(pvecOutScaleData);
+#endif
+
+  pdvecBias = (xb_vec2Nx8 *) (pBiasData);
+  vaBias    = IVP_LA2NX8_PP(pdvecBias);
+  valign vaOut = IVP_ZALIGN();
+
+  for (outCh = 0; outCh < numOutCh; outCh += XCHAL_IVPN_SIMD_WIDTH)
+  {
+    int32_t remOutCh = (numOutCh - outCh);
+    /* Initialize the accumulators with the 48-bit bias values */
+    if (inputFlag) // Biases will be loaded only when "inputFlag" is set
+    {
+      ACC_INIT_BIAS64_MOD_ONEACC(pdvecBias, vaBias, remOutCh, vecBias);
+    }
+
+#ifdef DILATED_VQ_CONV_PARTIAL
+    IVP_LAVNX16U_XP(vecOutScaleU, vaScale, pvecOutScaleData, 2 * remOutCh);
+#endif
+
+    for (y = 0; y < outHeight; y += 2)
+    {
+      // Calculating "remY" for integrated tail-handling purpose
+      int32_t remY = XT_MIN(1, outHeight - y - 1);
+      for (x = 0; x < outWidth; x += 2)
+      {
+        // Calculating "remX" for integrated tail-handling purpose
+        int32_t remX    = XT_MIN(1, outWidth - x - 1);
+        int16_t *pData1 = (int16_t *) (pInData + (x * strideX * inDataPitch1) + (y * strideY * inDataPitch2));
+        int64_t *pAcc   = (int64_t *) (pAccData + outCh + (x * accDataPitch1) + (y * accDataPitch2));
+
+        if (inputFlag) // if "inputFlag" is set, then initialize the accumulators with the bias values
+        {
+          /* Initializing all the 4 accumulators with bias values before accumulating for every spatial location */
+          vecAcc4 = vecAcc3 = vecAcc2 = vecAcc1 = vecBias;
+        }
+        else // if "inputFlag" is not-set, then initialize the accumulators with the values stored in the accTile
+        {
+          // Loading accumulated values from W = 0, H = 0 spatial location initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc);
+          vaAcc        = IVP_LA2NX8_PP(pdvecAccData);
+          IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          vecAcc1 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL);
+          IVP_CVT48UN_2X64H(vecAcc1, dvecAccHH, dvecAccHL);
+
+          // Loading accumulated values from W = 1, H = 0 spatial location initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1));
+          vaAcc        = IVP_LA2NX8_PP(pdvecAccData);
+          IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          vecAcc2 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL);
+          IVP_CVT48UN_2X64H(vecAcc2, dvecAccHH, dvecAccHL);
+
+          // Loading accumulated values from W = 0, H = 1 spatial location initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remY * accDataPitch2));
+          vaAcc        = IVP_LA2NX8_PP(pdvecAccData);
+          IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          vecAcc3 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL);
+          IVP_CVT48UN_2X64H(vecAcc3, dvecAccHH, dvecAccHL);
+
+          // Loading accumulated values from W = 1, H = 1 spatial location initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1) + (remY * accDataPitch2));
+          vaAcc        = IVP_LA2NX8_PP(pdvecAccData);
+          IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          vecAcc4 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL);
+          IVP_CVT48UN_2X64H(vecAcc4, dvecAccHH, dvecAccHL);
+        }
+
+        for (k = 0; k < kWidthU * kHeightU; k++)
+        {
+          // Adjusting the coefficient data pointer
+          pvecCoeff = (xb_vecNx16 *) (pCoeffData + outCh + ((k % kWidthU) * coeffDataPitch2) + ((k / kWidthU) * coeffDataPitch3));
+          int16_t *pData2 = (int16_t *) (pData1 + (((k % kWidthU) * dilationU) * inDataPitch1) + (((k / kWidthU) * dilationU) * inDataPitch2));
+          // phvecIn1 initially points to W = 0, H = 0 spatial location
+          phvecIn1 = (xb_vecN_2x32v *) (pData2);
+          // phvecIn2 initially points to W = 1, H = 0 spatial location
+          phvecIn2 = (xb_vecN_2x32v *) (pData2 + (remX * strideX * inDataPitch1));
+          // phvecIn3 initially points to W = 0, H = 1 spatial location
+          phvecIn3 = (xb_vecN_2x32v *) (pData2 + (remY * strideY * inDataPitch2));
+          // phvecIn4 initially points to W = 1, H = 1 spatial location
+          phvecIn4 = (xb_vecN_2x32v *) (pData2 + (remX * strideX * inDataPitch1) + (remY * strideY * inDataPitch2));
+
+          vaIn1 = IVP_LAN_2X32_PP(phvecIn1);
+          vaIn2 = IVP_LAN_2X32_PP(phvecIn2);
+          vaIn3 = IVP_LAN_2X32_PP(phvecIn3);
+          vaIn4 = IVP_LAN_2X32_PP(phvecIn4);
+
+          for (inCh = 0; inCh < (numInCh - 1); inCh += 2)
+          {
+            // hvecIn1 contains 4 bytes or 2 elements along D from W = 0, H = 0 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn1, vaIn1, phvecIn1, 4);
+            // hvecIn2 contains 4 bytes or 2 elements along D from W = 1, H = 0 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn2, vaIn2, phvecIn2, 4);
+            // hvecIn3 contains 4 bytes or 2 elements along D from W = 0, H = 1 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn3, vaIn3, phvecIn3, 4);
+            // hvecIn4 contains 4 bytes or 2 elements along D from W = 1, H = 1 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn4, vaIn4, phvecIn4, 4);
+
+            // vecCoeff1 contains 64 bytes or 32 elements along output depth (N) from initial input depth (D = 0) initially
+            IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, (2 * coeffDataPitch1));
+            // vecCoeff2 contains 64 bytes or 32 elements along output depth (N) from next input depth (D = 1) initially
+            IVP_L2UNX16_XP(vecCoeff2, pvecCoeff, (2 * coeffDataPitch1));
+
+            // vecAcc1 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 0 spatial location initially
+            IVP_MULPAN16XR16(vecAcc1, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn1, 0));
+            // vecAcc2 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 0 spatial location initially
+            IVP_MULPAN16XR16(vecAcc2, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn2, 0));
+            // vecAcc3 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 1 spatial location initially
+            IVP_MULPAN16XR16(vecAcc3, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn3, 0));
+            // vecAcc4 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 1 spatial location initially
+            IVP_MULPAN16XR16(vecAcc4, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn4, 0));
+          } // End of for (inCh = 0; inCh < numInCh; inCh += 2)
+
+          if (inCh < numInCh)
+          {
+            // hvecIn1 contains 2 bytes or 1 element along D from W = 0, H = 0 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn1, vaIn1, phvecIn1, 2);
+            // hvecIn2 contains 2 bytes or 1 element along D from W = 1, H = 0 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn2, vaIn2, phvecIn2, 2);
+            // hvecIn3 contains 2 bytes or 1 element along D from W = 0, H = 1 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn3, vaIn3, phvecIn3, 2);
+            // hvecIn4 contains 2 bytes or 1 element along D from W = 1, H = 1 spatial location initially
+            IVP_LAVN_2X32_XP(hvecIn4, vaIn4, phvecIn4, 2);
+
+            // vecCoeff1 contains 64 bytes or 32 elements along output depth (N) from initial input depth (D = 0)
+            IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, (2 * coeffDataPitch1));
+
+            // vecAcc1 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 0 spatial location initially
+            IVP_MULPAN16XR16(vecAcc1, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn1, 0));
+            // vecAcc2 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 0 spatial location initially
+            IVP_MULPAN16XR16(vecAcc2, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn2, 0));
+            // vecAcc3 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 1 spatial location initially
+            IVP_MULPAN16XR16(vecAcc3, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn3, 0));
+            // vecAcc4 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 1 spatial location initially
+            IVP_MULPAN16XR16(vecAcc4, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn4, 0));
+          }
+        }   // End of for (k = 0; k < kWidthU * kHeightU; k++)
+
+        if (outputFlag) // if "outputFlag" is set, apply pack, scale, shift, clamp logic on accumulated values and store the output
+        {
+          /* Pack, scale, shift, clamp logic to follow */
+#ifdef DILATED_VQ_CONV_PARTIAL
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut1, vecAcc1, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut2, vecAcc2, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut3, vecAcc3, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut4, vecAcc4, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim);
+#else
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1, vecAcc1, packShiftAccU, outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2, vecAcc2, packShiftAccU, outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3, vecAcc3, packShiftAccU, outScale, outShiftU, minLim, maxLim);
+          PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4, vecAcc4, packShiftAccU, outScale, outShiftU, minLim, maxLim);
+#endif
+          // Storing 64 bytes or 32 elements along output depth (N) at W = 0, H = 0 initially
+          pvecOut = (xb_vecNx16 *) (pOutData + outCh + (x * outDataPitch1) + (y * outDataPitch2));
+          IVP_SAVNX16_XP(vecOut1, vaOut, pvecOut, (2 * remOutCh));
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+
+          // Storing 64 bytes or 32 elements along output depth (N) at W = 1, H = 0 initially
+          pvecOut = (xb_vecNx16 *) (pOutData + outCh + ((x + remX) * outDataPitch1) + (y * outDataPitch2));
+          IVP_SAVNX16_XP(vecOut2, vaOut, pvecOut, (2 * remOutCh) * remX);
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+
+          // Storing 64 bytes or 32 elements along output depth (N) at W = 0, H = 1 initially
+          pvecOut = (xb_vecNx16 *) (pOutData + outCh + (x * outDataPitch1) + ((y + remY) * outDataPitch2));
+          IVP_SAVNX16_XP(vecOut3, vaOut, pvecOut, (2 * remOutCh) * remY);
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+
+          // Storing 64 bytes or 32 elements along output depth (N) at W = 1, H = 1 initially
+          pvecOut = (xb_vecNx16 *) (pOutData + outCh + ((x + remX) * outDataPitch1) + ((y + remY) * outDataPitch2));
+          IVP_SAVNX16_XP(vecOut4, vaOut, pvecOut, (2 * remOutCh) * remX * remY);
+          IVP_SAPOSNX16_FP(vaOut, pvecOut);
+        }
+        else // if "outputFlag" is not-set, store the accumulated values to the accTile
+        {
+          vaAcc     = IVP_ZALIGN();
+          dvecAccLL = IVP_CVT64SNX48LL(vecAcc1);
+          dvecAccLH = IVP_CVT64SNX48LH(vecAcc1);
+          dvecAccHL = IVP_CVT64SNX48HL(vecAcc1);
+          dvecAccHH = IVP_CVT64SNX48HH(vecAcc1);
+          // Storing 32 elements at W = 0, H = 0 initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc);
+          IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData);
+
+          dvecAccLL = IVP_CVT64SNX48LL(vecAcc2);
+          dvecAccLH = IVP_CVT64SNX48LH(vecAcc2);
+          dvecAccHL = IVP_CVT64SNX48HL(vecAcc2);
+          dvecAccHH = IVP_CVT64SNX48HH(vecAcc2);
+          // Storing 32 elements at W = 1, H = 0 initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1));
+          IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData);
+
+          dvecAccLL = IVP_CVT64SNX48LL(vecAcc3);
+          dvecAccLH = IVP_CVT64SNX48LH(vecAcc3);
+          dvecAccHL = IVP_CVT64SNX48HL(vecAcc3);
+          dvecAccHH = IVP_CVT64SNX48HH(vecAcc3);
+          // Storing 32 elements at W = 0, H = 1 initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remY * accDataPitch2));
+          IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData);
+
+          dvecAccLL = IVP_CVT64SNX48LL(vecAcc4);
+          dvecAccLH = IVP_CVT64SNX48LH(vecAcc4);
+          dvecAccHL = IVP_CVT64SNX48HL(vecAcc4);
+          dvecAccHH = IVP_CVT64SNX48HH(vecAcc4);
+          // Storing 32 elements at W = 1, H = 1 initially
+          pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1) + (remY * accDataPitch2));
+          IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh));
+          IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH));
+          IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData);
+        }
+      } // End of for (x = 0; x < outWidth; x += 2)
+    }   // End of for (y = 0; y < outHeight; y += 2)
+  }     // End of for (outCh = 0; outCh < numOutCh; outCh += XCHAL_IVPN_SIMD_WIDTH)
+
+  return(XAI_ERROR_STATUS());
+}
+#endif // #if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_extend_edge.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_extend_edge.h
new file mode 100644
index 00000000000..fe3bb154328
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_extend_edge.h
@@ -0,0 +1,1517 @@
+/*
+ * Copyright (c) 2022 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT)               name ## _ ## MORPH_FNAME_SPECIFIER_IDT
+#define MAKE_NAME_IMPL_1(name, MORPH_FNAME_SPECIFIER_IDT, dataOrder)  name ## _ ## MORPH_FNAME_SPECIFIER_IDT ## _ ## dataOrder
+/* Bind the MORPH_* type, check and intrinsic macros for the element type selected by INPUT_DATA_TYPE. */
+#if INPUT_DATA_TYPE == INTEGER8BIT // int8_t scalars, xb_vec2Nx8 vectors
+
+#undef MAKE_ARGUMENTS
+#undef MAKE_ARGUMENTS2
+#undef MAKE_NAME
+#undef MAKE_NAME_1
+#undef MORPH_OP_FUNCTION
+#undef MORPH_OP_FUNCTION_CONST
+#undef MORPH_IDT_CHECK
+#undef MORPH_ADT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_FILLTILE
+#undef MORPH_OP_LOAD
+#undef MORPH_OP_AND
+#undef MORPH_OP_SEQ
+#undef MORPH_OP_SEL
+#undef MORPH_OP_STORE
+#undef MORPH_OP_PRIME
+#undef MORPH_IDT_VEC
+#undef MORPH_OP_FLUSH
+#undef MORPH_VECTORIZATIONWIDTH
+
+#define MAKE_ARGUMENTS(a, b, c)   (xai_pTile3D a, const int32_t b, xai_size3D c)
+#define MAKE_ARGUMENTS2(a, b, c)  (xai_pTile3D a, const int8_t * b, xai_size3D c)
+#define MORPH_OP_FUNCTION        extendWHEdges3D_I8
+#define MORPH_OP_FUNCTION_CONST  extendEdgesConst3D_I8
+#define MAKE_NAME(name)               MAKE_NAME_IMPL(name, I8)
+#define MAKE_NAME_1(name, dataOrder)  MAKE_NAME_IMPL_1(name, I8, dataOrder)
+#define MORPH_IDT_CHECK           XAI_CHECK_TILE3D_I8
+#define MORPH_ADT_CHECK           XAI_CHECK_ARRAY_I8
+#define MORPH_IDT_SCALAR          int8_t
+#define MORPH_IDT_FILLTILE        xaiFillTile3D_I8
+#define MORPH_OP_LOAD             IVP_LAV2NX8_XP
+#define MORPH_OP_AND              IVP_AND2NX8
+#define MORPH_OP_SEQ              IVP_SEQ2NX8
+#define MORPH_OP_SEL              IVP_SEL2NX8
+#define MORPH_OP_STORE            IVP_SAV2NX8_XP
+#define MORPH_OP_PRIME            IVP_LA2NX8_PP
+#define MORPH_IDT_VEC             xb_vec2Nx8
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8_FP
+#define MORPH_VECTORIZATIONWIDTH  (2 * XCHAL_IVPN_SIMD_WIDTH) /* parenthesized so the expansion is precedence-safe at any use site */
+
+#elif INPUT_DATA_TYPE == INTEGER16BIT // int16_t scalars, xb_vecNx16 vectors
+
+#undef MAKE_ARGUMENTS
+#undef MAKE_ARGUMENTS2
+#undef MAKE_NAME
+#undef MAKE_NAME_1
+#undef MORPH_OP_FUNCTION
+#undef MORPH_OP_FUNCTION_CONST
+#undef MORPH_IDT_CHECK
+#undef MORPH_ADT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_FILLTILE
+#undef MORPH_OP_LOAD
+#undef MORPH_OP_AND
+#undef MORPH_OP_SEQ
+#undef MORPH_OP_SEL
+#undef MORPH_OP_STORE
+#undef MORPH_OP_PRIME
+#undef MORPH_IDT_VEC
+#undef MORPH_OP_FLUSH
+#undef MORPH_VECTORIZATIONWIDTH
+
+#define MAKE_ARGUMENTS(a, b, c)       (xai_pTile3D a, const int32_t b, xai_size3D c)
+#define MAKE_ARGUMENTS2(a, b, c)      (xai_pTile3D a, const int16_t * b, xai_size3D c)
+#define MAKE_NAME(name)               MAKE_NAME_IMPL(name, I16)
+#define MAKE_NAME_1(name, dataOrder)  MAKE_NAME_IMPL_1(name, I16, dataOrder)
+#define MORPH_OP_FUNCTION         extendWHEdges3D_I16
+#define MORPH_OP_FUNCTION_CONST   extendEdgesConst3D_I16
+#define MORPH_IDT_CHECK           XAI_CHECK_TILE3D_I16
+#define MORPH_ADT_CHECK           XAI_CHECK_ARRAY_I16
+#define MORPH_IDT_SCALAR          int16_t
+#define MORPH_IDT_FILLTILE        xaiFillTile3D_I16
+#define MORPH_OP_LOAD             IVP_LAVNX16_XP
+#define MORPH_OP_AND              IVP_ANDNX16
+#define MORPH_OP_SEQ              IVP_SEQNX16
+#define MORPH_OP_SEL              IVP_SELNX16
+#define MORPH_OP_STORE            IVP_SAVNX16_XP
+#define MORPH_OP_PRIME            IVP_LANX16_PP
+#define MORPH_IDT_VEC             xb_vecNx16
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16_FP
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH
+
+#elif INPUT_DATA_TYPE == FLOAT16BIT // NOTE(review): MORPH_OP_AND/SEQ/SEL are not #undef'd in this branch - confirm intentional
+#undef MAKE_ARGUMENTS
+#undef MAKE_ARGUMENTS2
+#undef MAKE_NAME
+#undef MAKE_NAME_1
+#undef MORPH_OP_FUNCTION
+#undef MORPH_OP_FUNCTION_CONST
+#undef MORPH_IDT_CHECK
+#undef MORPH_ADT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_FILLTILE
+#undef MORPH_OP_LOAD
+#undef MORPH_OP_STORE
+#undef MORPH_OP_PRIME
+#undef MORPH_IDT_VEC
+#undef MORPH_OP_FLUSH
+#undef MORPH_VECTORIZATIONWIDTH
+
+#if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+#define MAKE_ARGUMENTS(a, b, c)   (xai_pTile3D a, const xb_f16 b, xai_size3D c)
+#define MAKE_ARGUMENTS2(a, b, c)  (xai_pTile3D a, const xb_f16 * b, xai_size3D c)
+#define MORPH_OP_FUNCTION        extendWHEdges3D_F16
+#define MORPH_OP_FUNCTION_CONST  extendEdgesConst3D_F16
+#define MAKE_NAME(name)               MAKE_NAME_IMPL(name, F16)
+#define MAKE_NAME_1(name, dataOrder)  MAKE_NAME_IMPL_1(name, F16, dataOrder)
+#define MORPH_IDT_CHECK           XAI_CHECK_TILE3D_F16
+#define MORPH_ADT_CHECK           XAI_CHECK_ARRAY_F16
+#define MORPH_IDT_SCALAR          xb_f16
+#define MORPH_IDT_FILLTILE        xaiFillTile3D_F16
+#define MORPH_OP_LOAD             IVP_LAVNXF16_XP
+#define MORPH_OP_STORE            IVP_SAVNXF16_XP
+#define MORPH_OP_PRIME            IVP_LANXF16_PP
+#define MORPH_IDT_VEC             xb_vecNxf16
+#define MORPH_OP_FLUSH            IVP_SAPOSNXF16_FP
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH
+#endif // #if (XCHAL_HAVE_VISION_HP_VFPU == 1); otherwise the macros #undef'd above stay undefined
+
+#elif INPUT_DATA_TYPE == FLOAT32BIT // NOTE(review): MORPH_OP_AND/SEQ/SEL are not #undef'd in this branch - confirm intentional
+#undef MAKE_ARGUMENTS
+#undef MAKE_ARGUMENTS2
+#undef MAKE_NAME
+#undef MAKE_NAME_1
+#undef MORPH_OP_FUNCTION
+#undef MORPH_OP_FUNCTION_CONST
+#undef MORPH_IDT_CHECK
+#undef MORPH_ADT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_FILLTILE
+#undef MORPH_OP_LOAD
+#undef MORPH_OP_STORE
+#undef MORPH_OP_PRIME
+#undef MORPH_IDT_VEC
+#undef MORPH_OP_FLUSH
+#undef MORPH_VECTORIZATIONWIDTH
+
+#if (XCHAL_HAVE_VISION_SP_VFPU == 1)
+#define MAKE_ARGUMENTS(a, b, c)   (xai_pTile3D a, const float b, xai_size3D c)
+#define MAKE_ARGUMENTS2(a, b, c)  (xai_pTile3D a, const float * b, xai_size3D c)
+#define MORPH_OP_FUNCTION        extendWHEdges3D_F32
+#define MORPH_OP_FUNCTION_CONST  extendEdgesConst3D_F32
+#define MAKE_NAME(name)               MAKE_NAME_IMPL(name, F32)
+#define MAKE_NAME_1(name, dataOrder)  MAKE_NAME_IMPL_1(name, F32, dataOrder)
+#define MORPH_IDT_CHECK           XAI_CHECK_TILE3D_F32
+#define MORPH_ADT_CHECK           XAI_CHECK_ARRAY_F32
+#define MORPH_IDT_SCALAR          float
+#define MORPH_IDT_FILLTILE        xaiFillTile3D_F32
+#define MORPH_OP_LOAD             IVP_LAVN_2XF32_XP
+#define MORPH_OP_STORE            IVP_SAVN_2XF32_XP
+#define MORPH_OP_PRIME            IVP_LAN_2XF32_PP
+#define MORPH_IDT_VEC             xb_vecN_2xf32
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2XF32_FP
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2) /* parenthesized so the expansion is precedence-safe at any use site */
+#endif // #if (XCHAL_HAVE_VISION_SP_VFPU == 1); otherwise the macros #undef'd above stay undefined
+#endif // #if INPUT_DATA_TYPE == ... (element-type selection)
+
+
+/*====================================================================================*/
+/*============= START of xaiExtendEdgesConst3D_* routines ============================*/
+/*====================================================================================*/
+
+/*************************** extendEdgesConst3D_I8  *************************/
+/*************************** extendEdgesConst3D_I16 *************************/
+/*************************** extendEdgesConst3D_F16 *************************/
+/*************************** extendEdgesConst3D_F32 *************************/
+/* Description : P6 implementation for extending the edges of a 3D tile     */
+/*               with a constant value. This function extends edges across  */
+/*               dimension 1 & dimension 2 of a 3D tile                     */
+/* Inputs      : constant value to fill the edges                           */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Destination Tile                                           */
+/* Assumptions : dstData is signed 8/16 bit Integer or half precision       */
+/*               float(FP16) or single precision float(FP32)                */
+/*               based on MORPH specifier.                                  */
+/****************************************************************************/
+static _XAI_INLINE_ void MAKE_NAME(extendEdgesConst3D) MAKE_ARGUMENTS(dstTile, constValue, frame3DSize)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t dim1Size  = XAI_TILE3D_GET_DIM1(dstTile);
+  const int32_t dim2Size  = XAI_TILE3D_GET_DIM2(dstTile);
+  const int32_t dim1Edge1 = XAI_TILE3D_GET_DIM1_EDGE1(dstTile);
+  const int32_t dim1Edge2 = XAI_TILE3D_GET_DIM1_EDGE2(dstTile);
+  const int32_t dim2Edge1 = XAI_TILE3D_GET_DIM2_EDGE1(dstTile);
+  const int32_t dim2Edge2 = XAI_TILE3D_GET_DIM2_EDGE2(dstTile);
+  int32_t dim3Size        = XAI_TILE3D_GET_DIM3(dstTile);
+
+  const int32_t dstDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(dstTile);
+  const int32_t dstDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(dstTile);
+  int32_t frame_dim1          = frame3DSize.dim1Size;
+  int32_t frame_dim2          = frame3DSize.dim2Size;
+  int32_t dim1ExtendEdgeSize  = dim1Size + dim1Edge1 + dim1Edge2;
+  int32_t dim2ExtendEdgeSize  = (dim2Size + dim2Edge1 + dim2Edge2) * dstDataPitch1;
+
+  int32_t start_x = XAI_TILE3D_GET_DIM1_COORD(dstTile);
+  int32_t start_y = XAI_TILE3D_GET_DIM2_COORD(dstTile);
+
+  MORPH_IDT_SCALAR *restrict pDst3D = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(dstTile);
+  int32_t ixmin                     = MAX2(start_x - dim1Edge1, 0);
+  int32_t ixmax                     = MIN2(start_x + dim1Size + dim1Edge2 - 1, frame_dim1 - 1);
+  int32_t iymin                     = MAX2(start_y - dim2Edge1, 0);
+  int32_t iymax                     = MIN2(start_y + dim2Size + dim2Edge2 - 1, frame_dim2 - 1);
+
+  int x, y, z; /* Loop variables */
+  const MORPH_IDT_SCALAR value = constValue;
+
+  // horizontal top
+  int32_t horTopXcord  = -dim1Edge1;
+  int32_t horTopYcord  = -dim2Edge1;
+  int32_t horTopWidth  = dim1Size + dim1Edge1 + dim1Edge2;
+  int32_t horTopHeight = iymin - (start_y - dim2Edge1);
+
+  // horizontal bottom
+  int32_t horBottomXcord  = -dim1Edge1;
+  int32_t horBottomYcord  = iymax + 1 - start_y;
+  int32_t horBottomWidth  = dim1Size + dim1Edge1 + dim1Edge2;
+  int32_t horBottomHeight = start_y + dim2Size + dim2Edge2 - 1 - iymax;
+
+  // vertical left
+  int32_t verLeftXcord  = -dim1Edge1;
+  int32_t verLeftYcord  = horTopYcord + horTopHeight;
+  int32_t verLeftWidth  = ixmin - (start_x - dim1Edge1);
+  int32_t verLeftHeight = iymax - iymin + 1;
+
+  // vertical right
+  int32_t verRightXcord  = ixmax + 1 - start_x;
+  int32_t verRightYcord  = horTopYcord + horTopHeight;
+  int32_t verRightWidth  = start_x + dim1Size + dim1Edge2 - 1 - ixmax;
+  int32_t verRightHeight = iymax - iymin + 1;
+
+  valign vaOutData1 = IVP_ZALIGN();
+
+  MORPH_IDT_VEC *restrict pdvecOut1, *restrict pdvecOut2;
+  MORPH_IDT_SCALAR *restrict pDst1, *restrict pDst2;
+  /* Most optimal case is when -
+     i. dim1 (including edges) has no extra padding
+     ii. Each plane, i.e. dim1 * dim2 (including edges in both dimensions) has no extra padding
+   */
+  if ((dstDataPitch1 == dim1ExtendEdgeSize) && (dstDataPitch2 == dim2ExtendEdgeSize))
+  {
+    int numIter = horTopWidth * horTopHeight + horBottomWidth * horBottomHeight;
+
+    // horizontal top first(z = 0) plane
+    if (horTopHeight > 0)
+    {
+      pDst1 = (MORPH_IDT_SCALAR *) pDst3D + \
+              ((horTopYcord * dstDataPitch1) + horTopXcord);
+      pdvecOut1 = (MORPH_IDT_VEC *) (pDst1);
+      for (x = 0; x < horTopWidth * horTopHeight; x += MORPH_VECTORIZATIONWIDTH)
+      {
+        MORPH_OP_STORE(value, vaOutData1, pdvecOut1,
+                       sizeof(MORPH_IDT_SCALAR) * (horTopWidth * horTopHeight - x));
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+      }
+    } //if( horTopHeight > 0)
+    z = 0;
+    if (dim3Size > 1)
+    {
+      for (; z < dim3Size - 1; z++) // In one loop, "horizontal bottom z plane" and "horizontal top (z + 1)" plane is covered
+      {
+        pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+                ((horBottomYcord * dstDataPitch1) + horBottomXcord);
+
+        pdvecOut1 = (MORPH_IDT_VEC *) (pDst1);
+        for (x = 0; x < numIter; x += MORPH_VECTORIZATIONWIDTH)
+        {
+          MORPH_OP_STORE(value, vaOutData1, pdvecOut1,
+                         sizeof(MORPH_IDT_SCALAR) * (numIter - x));
+          MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+        }
+      }
+    }
+
+    // horizontal bottom last(z = dim3Size - 1) plane
+    if (horBottomHeight > 0)
+    {
+      pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+              ((horBottomYcord * dstDataPitch1) + horBottomXcord);
+      pdvecOut1 = (MORPH_IDT_VEC *) (pDst1);
+      for (x = 0; x < horBottomWidth * horBottomHeight; x += MORPH_VECTORIZATIONWIDTH)
+      {
+        MORPH_OP_STORE(value, vaOutData1, pdvecOut1,
+                       sizeof(MORPH_IDT_SCALAR) * (horBottomWidth * horBottomHeight - x));
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+      }
+    }
+  }
+  else
+  {
+    for (z = 0; z < dim3Size; z += 2)
+    {
+      int32_t remZ = XT_SALT(1, dim3Size - z);  //remaining (dim3Size - z) greater than 1, then remZ = 1, else 0
+
+      // horizontal top
+      pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+              ((horTopYcord * dstDataPitch1) + horTopXcord);
+      pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+              ((horTopYcord * dstDataPitch1) + horTopXcord);
+      if (horTopHeight > 0)
+      {
+        for (x = 0; x < horTopWidth; x += MORPH_VECTORIZATIONWIDTH)
+        {
+          int32_t remX = XT_MIN((horTopWidth - x), MORPH_VECTORIZATIONWIDTH);
+          for (y = 0; y < horTopHeight; y++)
+          {
+            pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x);
+            pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x);
+            MORPH_OP_STORE(value, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX);
+            MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+            MORPH_OP_STORE(value, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ);
+            MORPH_OP_FLUSH(vaOutData1, pdvecOut2);
+          }
+        }
+      } //if( horTopHeight > 0)
+
+      // horizontal bottom
+      pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+              ((horBottomYcord * dstDataPitch1) + horBottomXcord);
+      pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+              ((horBottomYcord * dstDataPitch1) + horBottomXcord);
+      if (horBottomHeight > 0)
+      {
+        for (x = 0; x < horBottomWidth; x += MORPH_VECTORIZATIONWIDTH)
+        {
+          int32_t remX = XT_MIN((horBottomWidth - x), MORPH_VECTORIZATIONWIDTH);
+          for (y = 0; y < horBottomHeight; y++)
+          {
+            pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x);
+            pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x);
+            MORPH_OP_STORE(value, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX);
+            MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+            MORPH_OP_STORE(value, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ);
+            MORPH_OP_FLUSH(vaOutData1, pdvecOut2);
+          }
+        }
+      }
+    }
+  }
+
+  for (z = 0; z < dim3Size; z += 2)
+  {
+    int remZ = XT_SALT(1, dim3Size - z);  //remaining (dim3Size - z) greater than 1, then remZ = 1, else 0
+
+    // vertical left
+    pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+            ((verLeftYcord * dstDataPitch1) + verLeftXcord);
+    pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+            ((verLeftYcord * dstDataPitch1) + verLeftXcord);
+
+    for (x = 0; x < verLeftWidth; x += MORPH_VECTORIZATIONWIDTH)
+    {
+      int32_t remX = XT_MIN((verLeftWidth - x), MORPH_VECTORIZATIONWIDTH);
+      for (y = 0; y < verLeftHeight; y++)
+      {
+        pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x);
+        pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x);
+        MORPH_OP_STORE(value, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX);
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+        MORPH_OP_STORE(value, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ);
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut2);
+      }
+    }
+
+    // vertical right
+    pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+            ((verRightYcord * dstDataPitch1) + verRightXcord);
+    pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+            ((verRightYcord * dstDataPitch1) + verRightXcord);
+
+    for (x = 0; x < verRightWidth; x += MORPH_VECTORIZATIONWIDTH)
+    {
+      int32_t remX = XT_MIN((verRightWidth - x), MORPH_VECTORIZATIONWIDTH);
+      for (y = 0; y < verRightHeight; y++)
+      {
+        pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x);
+        pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x);
+        MORPH_OP_STORE(value, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX);
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+        MORPH_OP_STORE(value, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ);
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut2);
+      }
+    }
+  }
+}
+
+/************************** xaiExtendEdgesConst3D_I8 ********************************/
+/************************** xaiExtendEdgesConst3D_I16 *******************************/
+/************************** xaiExtendEdgesConst3D_F16 *******************************/
+/************************** xaiExtendEdgesConst3D_F32 *******************************/
+/* Description : P6 optimized generic implementation of the xaiExtendEdgesConst3D   */
+/*               function. Based on MORPH pre-processor specifiers, the code is     */
+/*               generated during the preprocessing stage. This method implements   */
+/*               the xaiExtendEdgesConst3D_I8, _I16, _F16 and _F32 functionality.   */
+/*               Planes outside the frame along dim3 go to MORPH_IDT_FILLTILE.      */
+/* Inputs      : value (constant to fill the edges), frame3DSize (frame size;       */
+/*               each dimension must be greater than 0)                             */
+/* Outputs     : XAI Error Code                                                     */
+/* InOuts      : Destination Tile                                                   */
+/* Assumptions : OutData is signed 8/16 bit Integer, FP16 or FP32 (MORPH specifier) */
+/************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiExtendEdgesConst3D) MAKE_ARGUMENTS(dstTile, value, frame3DSize)
+{
+  /* Error checks: the tile is validated and every frame dimension must be greater than 0 */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(dstTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(dstTile);
+    XAI_CHECK_ERROR((frame3DSize.dim1Size > 0) && (frame3DSize.dim2Size > 0) &&                                                               \
+                    (frame3DSize.dim3Size > 0), XAI_ERR_DATASIZE,                                                                             \
+                    "\nframe3DSize.dim1Size = %d, frame3DSize.dim2Size = %d, frame3DSize.dim3Size = %d\nDimensions should be greater than 0", \
+                    frame3DSize.dim1Size, frame3DSize.dim2Size, frame3DSize.dim3Size);
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(dstTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(dstTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(dstTile);
+  const int32_t dim1Edge1     = XAI_TILE3D_GET_DIM1_EDGE1(dstTile);
+  const int32_t dim1Edge2     = XAI_TILE3D_GET_DIM1_EDGE2(dstTile);
+  const int32_t dim2Edge1     = XAI_TILE3D_GET_DIM2_EDGE1(dstTile);
+  const int32_t dim2Edge2     = XAI_TILE3D_GET_DIM2_EDGE2(dstTile);
+  const int32_t dim3Edge1     = XAI_TILE3D_GET_DIM3_EDGE1(dstTile);
+  const int32_t dim3Edge2     = XAI_TILE3D_GET_DIM3_EDGE2(dstTile);
+  const int32_t dstDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(dstTile);
+  const int32_t dstDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(dstTile);
+
+  MORPH_IDT_SCALAR *pDst = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(dstTile);
+
+  int32_t frame_dim1 = frame3DSize.dim1Size;
+  int32_t frame_dim2 = frame3DSize.dim2Size;
+  int32_t frame_dim3 = frame3DSize.dim3Size;
+  int32_t start_x    = XAI_TILE3D_GET_DIM1_COORD(dstTile);
+  int32_t start_y    = XAI_TILE3D_GET_DIM2_COORD(dstTile);
+  int32_t start_z    = XAI_TILE3D_GET_DIM3_COORD(dstTile);
+  /* Intersection of the tile (edges included) with the frame, in frame coordinates */
+  int32_t ixmin = MAX2(start_x - dim1Edge1, 0);
+  int32_t ixmax = MIN2(start_x + dim1Size + dim1Edge2 - 1, frame_dim1 - 1);
+  int32_t iymin = MAX2(start_y - dim2Edge1, 0);
+  int32_t iymax = MIN2(start_y + dim2Size + dim2Edge2 - 1, frame_dim2 - 1);
+  int32_t izmin = MAX2(start_z - dim3Edge1, 0);
+  int32_t izmax = MIN2(start_z + dim3Size + dim3Edge2 - 1, frame_dim3 - 1);
+
+  /* Empty tile/frame intersection: hand the whole tile to the fill routine and return its status */
+  if ((ixmin > ixmax) || (iymin > iymax) || (izmin > izmax))
+  {
+    return(MORPH_IDT_FILLTILE(dstTile, value, 1));
+  }
+
+  /*******************************************************************************/
+  /* P6 implementation of xaiExtendEdgesConst3D is split into 3 parts.           */
+  /* Planes of the 3rd dimension edges that lie outside the frame are handled    */
+  /* with the FillTile3D functionality, split into front end (edge 1) and        */
+  /* rear end (edge 2) processing. The planes of the 3rd dimension that          */
+  /* intersect the frame (middle part) are processed similar to the 2D           */
+  /* implementation of the ExtendEdges functionality.                            */
+  /* The same local tile descriptor (dst_t) is reused for all three parts.       */
+  /*******************************************************************************/
+
+  MORPH_IDT_SCALAR *pDst1;
+
+  /* Number of front-end planes, i.e. tile planes before the first frame plane */
+  int32_t dim3SizeFrontEnd = izmin - (start_z - dim3Edge1);
+  /* Plane offset, relative to the tile data pointer, of the first middle plane */
+  int32_t dim3CordMiddle = izmin - start_z;
+  /* Number of middle planes, i.e. tile planes that intersect the frame */
+  int32_t dim3SizeMiddle = izmax - izmin + 1;
+  /* Plane offset, relative to the tile data pointer, of the first rear-end plane */
+  int32_t dim3CordRearEnd = izmax + 1 - start_z;
+  /* Number of rear-end planes, i.e. tile planes past the last frame plane */
+  int32_t dim3SizeRearEnd = start_z + dim3Size + dim3Edge2 - 1 - izmax;
+
+  /* Populate a local 3D tile descriptor from dstTile. Its data pointer and    */
+  /* dim3 size are set separately before each of the three processing steps.   */
+  xai_tile3D dst_t;
+  /* Copy dim1/dim2 geometry, coordinates, buffer info and type; the dim3 edges are set to 0 */
+  XAI_TILE3D_SET_DIM1(&dst_t, dim1Size);
+  XAI_TILE3D_SET_DIM1_PITCH(&dst_t, dstDataPitch1);
+  XAI_TILE3D_SET_DIM1_EDGE1(&dst_t, dim1Edge1);
+  XAI_TILE3D_SET_DIM1_EDGE2(&dst_t, dim1Edge2);
+  XAI_TILE3D_SET_DIM2(&dst_t, dim2Size);
+  XAI_TILE3D_SET_DIM2_PITCH(&dst_t, dstDataPitch2);
+  XAI_TILE3D_SET_DIM2_EDGE1(&dst_t, dim2Edge1);
+  XAI_TILE3D_SET_DIM2_EDGE2(&dst_t, dim2Edge2);
+  XAI_TILE3D_SET_DIM3_EDGE1(&dst_t, 0);
+  XAI_TILE3D_SET_DIM3_EDGE2(&dst_t, 0);
+  XAI_TILE3D_SET_DIM1_COORD(&dst_t, start_x);
+  XAI_TILE3D_SET_DIM2_COORD(&dst_t, start_y);
+  XAI_TILE3D_SET_DIM3_COORD(&dst_t, start_z);
+  XAI_TILE3D_SET_BUFF_PTR(&dst_t, XAI_TILE3D_GET_BUFF_PTR(dstTile));
+  XAI_TILE3D_SET_BUFF_SIZE(&dst_t, XAI_TILE3D_GET_BUFF_SIZE(dstTile));
+  XAI_TILE3D_SET_TYPE(&dst_t, XAI_TILE3D_GET_TYPE(dstTile));
+
+  /***********************************************************************************/
+  /* Processing across the 3rd dimension edges (edge1 and edge2)                     */
+  /* Processing across 3rd dimension edge 1 is referred as Front End Processing      */
+  /* Processing across 3rd dimension edge 2 is referred as Rear End Processing       */
+  /* Local copy of 3D tile is declared and updated with destination tile parameters. */
+  /* Size parameter across third dimension is updated based on number of 2D tiles    */
+  /* to be processed across front and rear end. In order to effectively use the      */
+  /* SIMD capabilities xaiFillTile3D implementation is utilized.                     */
+  /***********************************************************************************/
+  if (dim3SizeFrontEnd > 0)
+  {
+    /***********************************************************************************/
+    /* Front end processing : Processing along the 3rd dimension edge 1.               */
+    /***********************************************************************************/
+
+    /* Point the local tile at the first plane of the dim3 edge 1 region */
+    pDst1 = &pDst[((-dim3Edge1) * dstDataPitch2)];
+    XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst1);
+    XAI_TILE3D_SET_DIM3(&dst_t, dim3SizeFrontEnd);
+    MORPH_IDT_FILLTILE(&dst_t, value, 1); /* NOTE(review): the returned status is not checked */
+  }
+  if (dim3SizeRearEnd > 0)
+  {
+    /***********************************************************************************/
+    /* Rear end processing : Processing along the 3rd dimension edge 2.                */
+    /***********************************************************************************/
+
+    /* Point the local tile at the first plane past the last frame plane */
+    pDst1 = &pDst[dim3CordRearEnd * dstDataPitch2];
+    XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst1);
+    XAI_TILE3D_SET_DIM3(&dst_t, dim3SizeRearEnd);
+    MORPH_IDT_FILLTILE(&dst_t, value, 1); /* NOTE(review): the returned status is not checked */
+  }
+
+  /* Middle part: point the local tile at the first plane that intersects the frame */
+  pDst1 = &pDst[(dim3CordMiddle * dstDataPitch2)];
+  XAI_TILE3D_SET_DIM3(&dst_t, dim3SizeMiddle);
+  XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst1);
+  /* NOTE(review): MORPH_OP_FUNCTION_CONST is defined outside this view; presumably the dim1/dim2 edge helper */
+  MORPH_OP_FUNCTION_CONST(&dst_t, value, frame3DSize);
+  return(XAI_ERROR_STATUS());
+}
+
+
+/*====================================================================================*/
+/*============= END of xaiExtendEdgesConst3D_* routines ==============================*/
+/*====================================================================================*/
+
+
+
+
+/*====================================================================================*/
+/*============= START of xaiExtendEdges3D_* routines =================================*/
+/*====================================================================================*/
+
+/************************** extendWHEdges3D_I8  *****************************/
+/************************** extendWHEdges3D_I16 *****************************/
+/************************** extendWHEdges3D_F16 *****************************/
+/************************** extendWHEdges3D_F32 *****************************/
+/* Description : P6 implementation for extending the edges of a 3D tile     */
+/*               along dimension 1 (W) and dimension 2 (H). Each depth plane*/
+/*               z is filled with its own edge value, pValue[z].            */
+/* Inputs      : pValue (array of edge values, one per depth plane),        */
+/*               frame3DSize (frame dimensions; dim1 and dim2 are used)     */
+/* Outputs     : None (the function returns void)                           */
+/* InOuts      : Destination Tile                                           */
+/* Assumptions : dstData is signed 8/16 bit Integer or half precision       */
+/*               float(FP16) or single precision float(FP32)                */
+/*               based on MORPH specifier.                                  */
+/****************************************************************************/
+static _XAI_INLINE_ void MAKE_NAME(extendWHEdges3D) MAKE_ARGUMENTS2(dstTile, pValue, frame3DSize)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t dim1Size  = XAI_TILE3D_GET_DIM1(dstTile);
+  const int32_t dim2Size  = XAI_TILE3D_GET_DIM2(dstTile);
+  const int32_t dim1Edge1 = XAI_TILE3D_GET_DIM1_EDGE1(dstTile);
+  const int32_t dim1Edge2 = XAI_TILE3D_GET_DIM1_EDGE2(dstTile);
+  const int32_t dim2Edge1 = XAI_TILE3D_GET_DIM2_EDGE1(dstTile);
+  const int32_t dim2Edge2 = XAI_TILE3D_GET_DIM2_EDGE2(dstTile);
+  int32_t dim3Size        = XAI_TILE3D_GET_DIM3(dstTile);
+
+  const int32_t dstDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(dstTile);
+  const int32_t dstDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(dstTile);
+  int32_t frame_dim1          = frame3DSize.dim1Size;
+  int32_t frame_dim2          = frame3DSize.dim2Size;
+  int32_t dim1ExtendEdgeSize  = dim1Size + dim1Edge1 + dim1Edge2; /* row width including both dim1 edges */
+
+  int32_t start_x = XAI_TILE3D_GET_DIM1_COORD(dstTile);
+  int32_t start_y = XAI_TILE3D_GET_DIM2_COORD(dstTile);
+  /* [ixmin..ixmax] x [iymin..iymax]: tile (edges included) clamped to the frame, in frame coordinates */
+  MORPH_IDT_SCALAR *restrict pDst3D = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(dstTile);
+  int32_t ixmin                     = MAX2(start_x - dim1Edge1, 0);
+  int32_t ixmax                     = MIN2(start_x + dim1Size + dim1Edge2 - 1, frame_dim1 - 1);
+  int32_t iymin                     = MAX2(start_y - dim2Edge1, 0);
+  int32_t iymax                     = MIN2(start_y + dim2Size + dim2Edge2 - 1, frame_dim2 - 1);
+
+  int x, y, z; /* Loop variables */
+  /* The four regions below use coordinates relative to the tile data pointer */
+  // horizontal top: full-width band of rows that lie above the frame
+  int32_t horTopXcord  = -dim1Edge1;
+  int32_t horTopYcord  = -dim2Edge1;
+  int32_t horTopWidth  = dim1Size + dim1Edge1 + dim1Edge2;
+  int32_t horTopHeight = iymin - (start_y - dim2Edge1);
+
+  // horizontal bottom: full-width band of rows that lie below the frame
+  int32_t horBottomXcord  = -dim1Edge1;
+  int32_t horBottomYcord  = iymax + 1 - start_y;
+  int32_t horBottomWidth  = dim1Size + dim1Edge1 + dim1Edge2;
+  int32_t horBottomHeight = start_y + dim2Size + dim2Edge2 - 1 - iymax;
+
+  // vertical left: columns left of the frame, limited to the rows between the two bands
+  int32_t verLeftXcord  = -dim1Edge1;
+  int32_t verLeftYcord  = horTopYcord + horTopHeight;
+  int32_t verLeftWidth  = ixmin - (start_x - dim1Edge1);
+  int32_t verLeftHeight = iymax - iymin + 1;
+
+  // vertical right: columns right of the frame, limited to the rows between the two bands
+  int32_t verRightXcord  = ixmax + 1 - start_x;
+  int32_t verRightYcord  = horTopYcord + horTopHeight;
+  int32_t verRightWidth  = start_x + dim1Size + dim1Edge2 - 1 - ixmax;
+  int32_t verRightHeight = iymax - iymin + 1;
+
+  valign vaOutData1 = IVP_ZALIGN();
+  valign vaOutData2 = IVP_ZALIGN();
+
+  MORPH_IDT_VEC *restrict pdvecOut1, *restrict pdvecOut2;
+  MORPH_IDT_SCALAR *restrict pDst1, *restrict pDst2;
+  /* Pitch equals the extended row width: band rows are contiguous, so each band is written as one linear run */
+  if (dstDataPitch1 == dim1ExtendEdgeSize)
+  {
+    for (z = 0; z < dim3Size; z += 2) /* two depth planes (z and z + remZ) per iteration */
+    {
+      int32_t remZ = XT_SALT(1, dim3Size - z);  // remZ = 1 if more than one plane remains (dim3Size - z > 1), else 0
+      /* With remZ == 0 the second plane aliases the first: same value, same pointer, store byte count scaled to 0 */
+      const MORPH_IDT_SCALAR value1 = pValue[z];
+      const MORPH_IDT_SCALAR value2 = pValue[z + remZ];
+
+      // horizontal top
+      pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+              ((horTopYcord * dstDataPitch1) + horTopXcord);
+      pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+              ((horTopYcord * dstDataPitch1) + horTopXcord);
+      if (horTopHeight > 0)
+      {
+        pdvecOut1 = (MORPH_IDT_VEC *) (pDst1);
+        pdvecOut2 = (MORPH_IDT_VEC *) (pDst2);
+        for (x = 0; x < horTopWidth * horTopHeight; x += MORPH_VECTORIZATIONWIDTH)
+        {
+          MORPH_OP_STORE(value1, vaOutData1, pdvecOut1,
+                         sizeof(MORPH_IDT_SCALAR) * (horTopWidth * horTopHeight - x));
+          MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+
+          MORPH_OP_STORE(value2, vaOutData2, pdvecOut2,
+                         sizeof(MORPH_IDT_SCALAR) * (horTopWidth * horTopHeight - x) * remZ);
+          MORPH_OP_FLUSH(vaOutData2, pdvecOut2);
+        }
+      }
+
+      // horizontal bottom
+      pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+              ((horBottomYcord * dstDataPitch1) + horBottomXcord);
+      pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+              ((horBottomYcord * dstDataPitch1) + horBottomXcord);
+      if (horBottomHeight > 0)
+      {
+        pdvecOut1 = (MORPH_IDT_VEC *) (pDst1);
+        pdvecOut2 = (MORPH_IDT_VEC *) (pDst2);
+        for (x = 0; x < horBottomWidth * horBottomHeight; x += MORPH_VECTORIZATIONWIDTH)
+        {
+          MORPH_OP_STORE(value1, vaOutData1, pdvecOut1,
+                         sizeof(MORPH_IDT_SCALAR) * (horBottomWidth * horBottomHeight - x));
+          MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+
+          MORPH_OP_STORE(value2, vaOutData2, pdvecOut2,
+                         sizeof(MORPH_IDT_SCALAR) * (horBottomWidth * horBottomHeight - x) * remZ);
+          MORPH_OP_FLUSH(vaOutData2, pdvecOut2);
+        }
+      }
+    }
+  }
+  else /* rows are padded: write each band row by row, in strips of MORPH_VECTORIZATIONWIDTH elements */
+  {
+    for (z = 0; z < dim3Size; z += 2) /* two depth planes (z and z + remZ) per iteration */
+    {
+      int32_t remZ = XT_SALT(1, dim3Size - z);  // remZ = 1 if more than one plane remains (dim3Size - z > 1), else 0
+
+      const MORPH_IDT_SCALAR value1 = pValue[z];
+      const MORPH_IDT_SCALAR value2 = pValue[z + remZ];
+
+      // horizontal top
+      pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+              ((horTopYcord * dstDataPitch1) + horTopXcord);
+      pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+              ((horTopYcord * dstDataPitch1) + horTopXcord);
+
+      if (horTopHeight > 0)
+      {
+        for (x = 0; x < horTopWidth; x += MORPH_VECTORIZATIONWIDTH)
+        {
+          int32_t remX = XT_MIN((horTopWidth - x), MORPH_VECTORIZATIONWIDTH); /* elements left in this strip */
+          for (y = 0; y < horTopHeight; y++)
+          {
+            pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x);
+            pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x);
+            MORPH_OP_STORE(value1, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX);
+            MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+            MORPH_OP_STORE(value2, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ);
+            MORPH_OP_FLUSH(vaOutData1, pdvecOut2);
+          }
+        }
+      } //if( horTopHeight > 0)
+
+      // horizontal bottom
+      pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+              ((horBottomYcord * dstDataPitch1) + horBottomXcord);
+      pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+              ((horBottomYcord * dstDataPitch1) + horBottomXcord);
+
+      if (horBottomHeight > 0)
+      {
+        for (x = 0; x < horBottomWidth; x += MORPH_VECTORIZATIONWIDTH)
+        {
+          int32_t remX = XT_MIN((horBottomWidth - x), MORPH_VECTORIZATIONWIDTH); /* elements left in this strip */
+          for (y = 0; y < horBottomHeight; y++)
+          {
+            pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x);
+            pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x);
+            MORPH_OP_STORE(value1, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX);
+            MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+            MORPH_OP_STORE(value2, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ);
+            MORPH_OP_FLUSH(vaOutData1, pdvecOut2);
+          }
+        }
+      }
+    }
+  }
+
+  /* Vertical left/right edges: written row by row in column strips, the same way for both pitch cases */
+  for (z = 0; z < dim3Size; z += 2) /* two depth planes (z and z + remZ) per iteration */
+  {
+    int32_t remZ = XT_SALT(1, dim3Size - z);  // remZ = 1 if more than one plane remains (dim3Size - z > 1), else 0
+
+    const MORPH_IDT_SCALAR value1 = pValue[z];
+    const MORPH_IDT_SCALAR value2 = pValue[z + remZ];
+
+    // vertical left
+    pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+            ((verLeftYcord * dstDataPitch1) + verLeftXcord);
+    pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+            ((verLeftYcord * dstDataPitch1) + verLeftXcord);
+
+    for (x = 0; x < verLeftWidth; x += MORPH_VECTORIZATIONWIDTH)
+    {
+      int32_t remX = XT_MIN((verLeftWidth - x), MORPH_VECTORIZATIONWIDTH); /* elements left in this strip */
+      for (y = 0; y < verLeftHeight; y++)
+      {
+        pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x);
+        pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x);
+        MORPH_OP_STORE(value1, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX);
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+        MORPH_OP_STORE(value2, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ);
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut2);
+      }
+    }
+
+    // vertical right
+    pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+            ((verRightYcord * dstDataPitch1) + verRightXcord);
+    pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+            ((verRightYcord * dstDataPitch1) + verRightXcord);
+
+    for (x = 0; x < verRightWidth; x += MORPH_VECTORIZATIONWIDTH)
+    {
+      int32_t remX = XT_MIN((verRightWidth - x), MORPH_VECTORIZATIONWIDTH); /* elements left in this strip */
+
+      for (y = 0; y < verRightHeight; y++)
+      {
+        pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x);
+        pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x);
+        MORPH_OP_STORE(value1, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX);
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+        MORPH_OP_STORE(value2, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ);
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut2);
+      }
+    }
+  }
+}
+
+
+/***************************** extendEdges3D_I8_WHD ******************************/
+/***************************** extendEdges3D_I16_WHD *****************************/
+/***************************** extendEdges3D_F16_WHD *****************************/
+/***************************** extendEdges3D_F32_WHD *****************************/
+/* Description : P6 optimized generic implementation of the extendEdges3D WHD    */
+/*               function. Based on MORPH pre-processor specifiers, the code     */
+/*               is generated during the preprocessing stage. This method        */
+/*               implements extendEdges3D_I8_WHD, extendEdges3D_I16_WHD,         */
+/*               extendEdges3D_F16_WHD and extendEdges3D_F32_WHD functionality.  */
+/* Inputs      : pArray (per-plane edge values), frame3DSize (frame dimensions)  */
+/* Outputs     : None (the function returns void)                                */
+/* InOuts      : Destination Tile                                                */
+/* Assumptions : OutData is signed/unsigned 8/16 bit Integer or                  */
+/*               half precision float(FP16) or single precision float(FP32)      */
+/*               based on MORPH specifier                                        */
+/*********************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME_1(extendEdges3D, WHD) (xai_pTile3D dstTile,
+                                                          const xai_pArray pArray,
+                                                          xai_size3D frame3DSize)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(dstTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(dstTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(dstTile);
+  const int32_t dim1Edge1     = XAI_TILE3D_GET_DIM1_EDGE1(dstTile);
+  const int32_t dim1Edge2     = XAI_TILE3D_GET_DIM1_EDGE2(dstTile);
+  const int32_t dim2Edge1     = XAI_TILE3D_GET_DIM2_EDGE1(dstTile);
+  const int32_t dim2Edge2     = XAI_TILE3D_GET_DIM2_EDGE2(dstTile);
+  const int32_t dim3Edge1     = XAI_TILE3D_GET_DIM3_EDGE1(dstTile);
+  const int32_t dim3Edge2     = XAI_TILE3D_GET_DIM3_EDGE2(dstTile);
+  const int32_t dstDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(dstTile);
+  const int32_t dstDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(dstTile);
+
+  int32_t frame_dim1 = frame3DSize.dim1Size;
+  int32_t frame_dim2 = frame3DSize.dim2Size;
+  int32_t frame_dim3 = frame3DSize.dim3Size;
+  int32_t start_x    = XAI_TILE3D_GET_DIM1_COORD(dstTile);
+  int32_t start_y    = XAI_TILE3D_GET_DIM2_COORD(dstTile);
+  int32_t start_z    = XAI_TILE3D_GET_DIM3_COORD(dstTile);
+
+  int32_t ixmin = MAX2(start_x - dim1Edge1, 0);
+  int32_t ixmax = MIN2(start_x + dim1Size + dim1Edge2 - 1, frame_dim1 - 1);
+  int32_t iymin = MAX2(start_y - dim2Edge1, 0);
+  int32_t iymax = MIN2(start_y + dim2Size + dim2Edge2 - 1, frame_dim2 - 1);
+  int32_t izmin = MAX2(start_z - dim3Edge1, 0);
+  int32_t izmax = MIN2(start_z + dim3Size + dim3Edge2 - 1, frame_dim3 - 1);
+
+  /* Update local 3D tile structure with dstTile structure parameters. Local   */
+  /* 3D tile structure is used as parameter to implement fillTile functionality */
+  xai_tile3D dst_t;
+  XAI_TILE3D_SET_DIM1(&dst_t, dim1Size);
+  XAI_TILE3D_SET_DIM1_PITCH(&dst_t, dstDataPitch1);
+  XAI_TILE3D_SET_DIM1_EDGE1(&dst_t, dim1Edge1);
+  XAI_TILE3D_SET_DIM1_EDGE2(&dst_t, dim1Edge2);
+  XAI_TILE3D_SET_DIM2(&dst_t, dim2Size);
+  XAI_TILE3D_SET_DIM2_PITCH(&dst_t, dstDataPitch2);
+  XAI_TILE3D_SET_DIM2_EDGE1(&dst_t, dim2Edge1);
+  XAI_TILE3D_SET_DIM2_EDGE2(&dst_t, dim2Edge2);
+  XAI_TILE3D_SET_DIM3_EDGE1(&dst_t, 0);
+  XAI_TILE3D_SET_DIM3_EDGE2(&dst_t, 0);
+  XAI_TILE3D_SET_DIM1_COORD(&dst_t, start_x);
+  XAI_TILE3D_SET_DIM2_COORD(&dst_t, start_y);
+  XAI_TILE3D_SET_DIM3_COORD(&dst_t, start_z);
+  XAI_TILE3D_SET_BUFF_PTR(&dst_t, XAI_TILE3D_GET_BUFF_PTR(dstTile));
+  XAI_TILE3D_SET_BUFF_SIZE(&dst_t, XAI_TILE3D_GET_BUFF_SIZE(dstTile));
+  XAI_TILE3D_SET_TYPE(&dst_t, XAI_TILE3D_GET_TYPE(dstTile));
+
+  MORPH_IDT_SCALAR *pDst         = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(dstTile);
+  const MORPH_IDT_SCALAR *pValue = (MORPH_IDT_SCALAR *) XAI_ARRAY_GET_DATA_PTR(pArray);
+  int32_t z; /* Loop variable */
+  MORPH_IDT_SCALAR *pDst1;
+  MORPH_IDT_SCALAR value;
+
+  /* Validation for Tile and Frame intersection */
+  int32_t frameIntersectionFlag = ((ixmin > ixmax) || (iymin > iymax) || (izmin > izmax));
+
+  /*********************************************************************************/
+  /* P6 implementation of xaiExtendEdges3D is similar to xaiExtendEdgesConst3D       */
+  /* implementation. In ExtendEdges functionality a unique value is used to        */
+  /* xaiExtendEdges, in xaiExtendEdges3D implementation each 2D tile is filled       */
+  /* with a value from xai_array, index by the co-ordinate position across third    */
+  /* dimension. In xaiExtendEdges3D implementation processing across 3rd            */
+  /* dimension edges, extendEdges need to perform for the entire 2D tile.          */
+  /* xaiExtendEdges3D processing is split into 3 parts. ExtendEdges processing      */
+  /* across 3rd dimension edges is split as front end and rear end processing.     */
+  /* Processing across 3rd dimension excluding the edge is implemented similar to  */
+  /* 2D implementation of extendEdges functionality.                               */
+  /*********************************************************************************/
+
+  if (frameIntersectionFlag)
+  {
+    /* If frameIntersectionFlag is enabled the tile exists outside frame boundary */
+    /* and ExtendEdges need to be done on the entire 3D tile.                     */
+
+    const int32_t dim3FillSize = dim3Size + dim3Edge1 + dim3Edge2;
+    pDst1 = &pDst[((-dim3Edge1) * dstDataPitch2)];
+    for (z = 0; z < dim3FillSize; z++) /* Loop across dim3 */
+    {
+      value = pValue[z];
+      /* update destination data pointer */
+      MORPH_IDT_SCALAR *pDst2 = pDst1 + (z * dstDataPitch2);
+      XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst2);
+      XAI_TILE3D_SET_DIM3(&dst_t, 1);
+      MORPH_IDT_FILLTILE(&dst_t, value, 1);
+    }
+    return;
+  }
+
+  /* Number of 2D tiles to be processed across edge1 3rd dimension */
+  int32_t dim3SizeFrontEnd = izmin - (start_z - dim3Edge1);
+  /* Offset calculation for Extend Edge across 3rd dimension excluding edges */
+  int32_t dim3CordMiddle = izmin - start_z;
+  /* Number of 2D tiles to be processed across 3rd dimension excluding edges */
+  int32_t dim3SizeMiddle = izmax - izmin + 1;
+  /* Offset calculation for Extend Edge across edge 2 3rd */
+  int32_t dim3CordRearEnd = izmax + 1 - start_z;
+  /* Number of 2D tiles processing to Extend Edge across 3rd edge2 dimension */
+  int32_t dim3SizeRearEnd = start_z + dim3Size + dim3Edge2 - 1 - izmax;
+
+  /***********************************************************************************/
+  /* Processing across the 3rd dimension edges (edge1 and edge2)                     */
+  /* Processing across 3rd dimension edge 1 is referred as Front End Processing      */
+  /* Processing across 3rd dimension edge 2 is referred as Rear End Processing       */
+  /* Local copy of 3D tile is declared and updated with destination tile parameters. */
+  /* Size parameter across third dimension is updated based on number of 2D tiles    */
+  /* to be processed across front and rear end. In order to effectively use the      */
+  /* SIMD capabilities xaiFillTile3D implementation is utilized.                      */
+  /***********************************************************************************/
+
+  if (dim3SizeFrontEnd > 0)
+  {
+    /***********************************************************************************/
+    /* Front end processing : Processing along the 3rd dimension edge 1.               */
+    /***********************************************************************************/
+
+    /* Update destination data pointer */
+    pDst1 = &pDst[((-dim3Edge1) * dstDataPitch2)];
+    XAI_TILE3D_SET_DIM3(&dst_t, 1);
+    for (z = 0; z < dim3SizeFrontEnd; z++) /* Loop across dim3 */
+    {
+      value = pValue[z];
+      /* update destination data pointer */
+      MORPH_IDT_SCALAR *pDst2 = pDst1 + (z * dstDataPitch2);
+      XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst2);
+      MORPH_IDT_FILLTILE(&dst_t, value, 1);
+    }
+  }
+  if (dim3SizeRearEnd > 0)
+  {
+    /***********************************************************************************/
+    /* Rear end processing : Processing along the 3rd dimension edge 2.                */
+    /***********************************************************************************/
+
+    /* Update destination data pointer */
+    pDst1 = &pDst[(dim3CordRearEnd * dstDataPitch2)];
+    XAI_TILE3D_SET_DIM3(&dst_t, 1);
+    for (z = 0; z < dim3SizeRearEnd; z++) /* Loop across dim3 */
+    {
+      /* update destination data pointer */
+      MORPH_IDT_SCALAR *pDst2 = pDst1 + (z * dstDataPitch2);
+      value = pValue[z + dim3CordRearEnd + dim3Edge1];
+      XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst2);
+      MORPH_IDT_FILLTILE(&dst_t, value, 1);
+    }
+  }
+
+  /* Update destination data pointer */
+  pDst1 = &pDst[(dim3CordMiddle * dstDataPitch2)];
+  XAI_TILE3D_SET_DIM3(&dst_t, dim3SizeMiddle);
+
+  XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst1);
+  MORPH_OP_FUNCTION(&dst_t, pValue + dim3CordMiddle + dim3Edge1, frame3DSize);
+}
+
+/*************************** extendEdges3D_I8_DWH *********************************/
+/*************************** extendEdges3D_I16_DWH ********************************/
+/*************************** extendEdges3D_F16_DWH ********************************/
+/*************************** extendEdges3D_F32_DWH ********************************/
+/* Description : P6 optimized generic extendEdges3D kernel for DWH data order.    */
+/*               The body is generated at preprocessing time from the MORPH       */
+/*               specifiers, yielding extendEdges3D_I8_DWH, _I16_DWH, _F16_DWH    */
+/*               and _F32_DWH. Edge elements outside the frame are filled with    */
+/*               the pArray value at the same dim1 position.                      */
+/* Inputs      : pArray (per-dim1 fill values), frame3DSize (frame extent)        */
+/* Outputs     : None (void)                                                      */
+/* InOuts      : Destination Tile                                                 */
+/* Assumptions : OutData is signed/unsigned 8/16 bit Integer or                   */
+/*               half precision float(FP16) or single precision float(FP32)       */
+/*               based on MORPH specifier                                         */
+/**********************************************************************************/
+
+static _XAI_INLINE_ void MAKE_NAME_1(extendEdges3D, DWH) (xai_pTile3D dstTile,
+                                                          const xai_pArray pArray,
+                                                          xai_size3D frame3DSize)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(dstTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(dstTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(dstTile);
+  const int32_t dim1Edge1     = XAI_TILE3D_GET_DIM1_EDGE1(dstTile);
+  const int32_t dim1Edge2     = XAI_TILE3D_GET_DIM1_EDGE2(dstTile);
+  const int32_t dim2Edge1     = XAI_TILE3D_GET_DIM2_EDGE1(dstTile);
+  const int32_t dim2Edge2     = XAI_TILE3D_GET_DIM2_EDGE2(dstTile);
+  const int32_t dim3Edge1     = XAI_TILE3D_GET_DIM3_EDGE1(dstTile);
+  const int32_t dim3Edge2     = XAI_TILE3D_GET_DIM3_EDGE2(dstTile);
+  const int32_t dstDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(dstTile); /* used below as element stride per dim2 step */
+  const int32_t dstDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(dstTile); /* used below as element stride per dim3 step */
+  const int32_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(dstTile);
+
+  int32_t frame_dim1 = frame3DSize.dim1Size;
+  int32_t frame_dim2 = frame3DSize.dim2Size;
+  int32_t frame_dim3 = frame3DSize.dim3Size;
+  int32_t start_x    = XAI_TILE3D_GET_DIM1_COORD(dstTile); // along Depth
+  int32_t start_y    = XAI_TILE3D_GET_DIM2_COORD(dstTile); // along Width
+  int32_t start_z    = XAI_TILE3D_GET_DIM3_COORD(dstTile); // along Height
+
+  int32_t ixmin = MAX2(start_x - dim1Edge1, 0); /* i*min / i*max: tile-plus-edge extent clamped to the frame */
+  int32_t ixmax = MIN2(start_x + dim1Size + dim1Edge2 - 1, frame_dim1 - 1);
+  int32_t iymin = MAX2(start_y - dim2Edge1, 0);
+  int32_t iymax = MIN2(start_y + dim2Size + dim2Edge2 - 1, frame_dim2 - 1);
+  int32_t izmin = MAX2(start_z - dim3Edge1, 0);
+  int32_t izmax = MIN2(start_z + dim3Size + dim3Edge2 - 1, frame_dim3 - 1);
+
+  // horizontal top: dim2 rows before the frame start, full dim1 width (coords are relative to the tile origin)
+  int32_t horTopXcord  = -dim1Edge1;
+  int32_t horTopYcord  = -dim2Edge1;
+  int32_t horTopWidth  = dim1Size + dim1Edge1 + dim1Edge2;
+  int32_t horTopHeight = iymin - (start_y - dim2Edge1);
+
+  // horizontal bottom: dim2 rows past the frame end, full dim1 width
+  int32_t horBottomXcord  = -dim1Edge1;
+  int32_t horBottomYcord  = iymax + 1 - start_y;
+  int32_t horBottomWidth  = dim1Size + dim1Edge1 + dim1Edge2;
+  int32_t horBottomHeight = start_y + dim2Size + dim2Edge2 - 1 - iymax;
+
+  // vertical left: dim1 elements before the frame start, in-frame dim2 rows only
+  int32_t verLeftXcord  = -dim1Edge1;
+  int32_t verLeftYcord  = horTopYcord + horTopHeight;
+  int32_t verLeftWidth  = ixmin - (start_x - dim1Edge1);
+  int32_t verLeftHeight = iymax - iymin + 1;
+
+  // vertical right: dim1 elements past the frame end, in-frame dim2 rows only
+  int32_t verRightXcord  = ixmax + 1 - start_x;
+  int32_t verRightYcord  = horTopYcord + horTopHeight;
+  int32_t verRightWidth  = start_x + dim1Size + dim1Edge2 - 1 - ixmax;
+  int32_t verRightHeight = iymax - iymin + 1;
+
+  // front: dim3 planes before the frame start, in-frame dim2 rows, full dim1 width
+  int32_t frontXcord  = -dim1Edge1;
+  int32_t frontYcord  = horTopYcord + horTopHeight;
+  int32_t frontZcord  = -dim3Edge1;
+  int32_t frontDepth  = izmin - (start_z - dim3Edge1);
+  int32_t frontWidth  = horTopWidth;
+  int32_t frontHeight = iymax - iymin + 1;
+
+  // rear: dim3 planes past the frame end, in-frame dim2 rows, full dim1 width
+  int32_t rearXcord  = -dim1Edge1;
+  int32_t rearYcord  = horTopYcord + horTopHeight;
+  int32_t rearZcord  = izmax + 1 - start_z;
+  int32_t rearDepth  = start_z + dim3Size + dim3Edge2 - 1 - izmax;
+  int32_t rearWidth  = horTopWidth;
+  int32_t rearHeight = iymax - iymin + 1;
+
+  int x, y, z; /* Loop variables */
+  valign vaOutData = IVP_ZALIGN();
+  valign vaArray;
+  int32_t vectorizationWidth = MORPH_VECTORIZATIONWIDTH;
+
+  MORPH_IDT_SCALAR *restrict pDst3D = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(dstTile);
+  /* pArray is an xai_pArray, so use the array accessor (as the WHD kernel does); pArr[i] pairs with tile dim1 offset i */
+  MORPH_IDT_SCALAR *restrict pArr   = (MORPH_IDT_SCALAR *) XAI_ARRAY_GET_DATA_PTR(pArray) + dim1Edge1;
+
+  MORPH_IDT_VEC *restrict pdvecArr, *restrict pdvecDst;
+  MORPH_IDT_VEC dvecArrData;
+
+  /* Tile and frame intersection is empty: fill the entire tile, edges included, from pArray */
+  if ((ixmin > ixmax) || (iymin > iymax) || (izmin > izmax))
+  {
+    pdvecArr = (MORPH_IDT_VEC *) (pArr - dim1Edge1);
+
+    /* priming of pArray */
+    vaArray = MORPH_OP_PRIME(pdvecArr);
+
+    for (x = 0; x < (dim1Size + dim1Edge1 + dim1Edge2); x += vectorizationWidth)
+    {
+      /* Load pArray; NOTE(review): presumably MORPH_OP_LOAD advances pdvecArr - confirm against its definition */
+      MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (dim1Size + dim1Edge1 + dim1Edge2 - x) * bytesPerPixel);
+
+      for (z = 0; z < (dim3Size + dim3Edge1 + dim3Edge2); z++)
+      {
+        for (y = 0; y < (dim2Size + dim2Edge1 + dim2Edge2); y++)
+        {
+          pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (z - dim3Edge1) * dstDataPitch2 + \
+                                        (y - dim2Edge1) * dstDataPitch1 + (-dim1Edge1) + x);
+
+          /* store array value in destination; byte count is what remains of the row from x */
+          MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (dim1Size + dim1Edge1 + dim1Edge2 - x) * bytesPerPixel);
+
+          MORPH_OP_FLUSH(vaOutData, pdvecDst); /* flushed per store: pdvecDst is recomputed every iteration */
+        }
+      }
+    }
+  }
+  else
+  {
+    /* Front Height Edge: runs only when the tile's dim3 edge1 starts before the frame */
+    if (frontDepth > 0)
+    {
+      pdvecArr = (MORPH_IDT_VEC *) (pArr + frontXcord);
+
+      /* priming of pArray */
+      vaArray = MORPH_OP_PRIME(pdvecArr);
+
+      for (x = 0; x < frontWidth; x += vectorizationWidth)
+      {
+        /* Load pArray */
+        MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (frontWidth - x) * bytesPerPixel);
+
+        for (z = 0; z < frontDepth; z++)
+        {
+          for (y = 0; y < frontHeight; y++)
+          {
+            pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (frontZcord + z) * dstDataPitch2 + \
+                                          (y + frontYcord) * dstDataPitch1 + frontXcord + x);
+
+            /* store array value in destination */
+            MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (frontWidth - x) * bytesPerPixel);
+
+            MORPH_OP_FLUSH(vaOutData, pdvecDst);
+          }
+        }
+      }
+    }
+
+    /* Rear Height Edge: runs only when the tile's dim3 edge2 ends past the frame */
+    if (rearDepth > 0)
+    {
+      pdvecArr = (MORPH_IDT_VEC *) (pArr + rearXcord);
+
+      /* priming of pArray */
+      vaArray = MORPH_OP_PRIME(pdvecArr);
+
+      for (x = 0; x < rearWidth; x += vectorizationWidth)
+      {
+        /* Load pArray */
+        MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (rearWidth - x) * bytesPerPixel);
+
+        for (z = 0; z < rearDepth; z++)
+        {
+          for (y = 0; y < rearHeight; y++)
+          {
+            pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (rearZcord + z) * dstDataPitch2 + \
+                                          (y + rearYcord) * dstDataPitch1 + rearXcord + x);
+
+            /* store array value in destination */
+            MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (rearWidth - x) * bytesPerPixel);
+
+            MORPH_OP_FLUSH(vaOutData, pdvecDst);
+          }
+        }
+      }
+    }
+
+    /* Top Width Edge: out-of-frame dim2 rows before the frame, written for every dim3 plane */
+    if (horTopHeight > 0)
+    {
+      pdvecArr = (MORPH_IDT_VEC *) (pArr + horTopXcord);
+
+      /* priming of pArray */
+      vaArray = MORPH_OP_PRIME(pdvecArr);
+
+      for (x = 0; x < horTopWidth; x += vectorizationWidth)
+      {
+        /* Load pArray */
+        MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (horTopWidth - x) * bytesPerPixel);
+
+        for (z = 0; z < (dim3Size + dim3Edge1 + dim3Edge2); z++)
+        {
+          for (y = 0; y < horTopHeight; y++)
+          {
+            pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (z - dim3Edge1) * dstDataPitch2 + \
+                                          (horTopYcord + y) * dstDataPitch1 + horTopXcord + x);
+
+            /* store array value in destination */
+            MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (horTopWidth - x) * bytesPerPixel);
+
+            MORPH_OP_FLUSH(vaOutData, pdvecDst);
+          }
+        }
+      }
+    }
+
+    /* Bottom Width Edge: out-of-frame dim2 rows past the frame, written for every dim3 plane */
+    if (horBottomHeight > 0)
+    {
+      pdvecArr = (MORPH_IDT_VEC *) (pArr + horBottomXcord);
+
+      /* priming of pArray */
+      vaArray = MORPH_OP_PRIME(pdvecArr);
+
+      for (x = 0; x < horBottomWidth; x += vectorizationWidth)
+      {
+        /* Load pArray */
+        MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (horBottomWidth - x) * bytesPerPixel);
+
+        for (z = 0; z < (dim3Size + dim3Edge1 + dim3Edge2); z++)
+        {
+          for (y = 0; y < horBottomHeight; y++)
+          {
+            pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (z - dim3Edge1) * dstDataPitch2 + \
+                                          (horBottomYcord + y) * dstDataPitch1 + horBottomXcord + x);
+
+            /* store array value in destination */
+            MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (horBottomWidth - x) * bytesPerPixel);
+
+            MORPH_OP_FLUSH(vaOutData, pdvecDst);
+          }
+        }
+      }
+    }
+
+    /* Left Depth Edge: out-of-frame dim1 elements before the frame, in-frame dim2 rows, every dim3 plane */
+    if (verLeftWidth > 0)
+    {
+      pdvecArr = (MORPH_IDT_VEC *) (pArr + verLeftXcord);
+
+      /* priming of pArray */
+      vaArray = MORPH_OP_PRIME(pdvecArr);
+
+      for (x = 0; x < verLeftWidth; x += vectorizationWidth)
+      {
+        /* Load pArray */
+        MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (verLeftWidth - x) * bytesPerPixel);
+
+        for (z = 0; z < (dim3Size + dim3Edge1 + dim3Edge2); z++)
+        {
+          for (y = 0; y < verLeftHeight; y++)
+          {
+            pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (z - dim3Edge1) * dstDataPitch2 + \
+                                          (verLeftYcord + y) * dstDataPitch1 + verLeftXcord + x);
+
+            /* store array value in destination */
+            MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (verLeftWidth - x) * bytesPerPixel);
+
+            MORPH_OP_FLUSH(vaOutData, pdvecDst);
+          }
+        }
+      }
+    }
+
+    /* Right Depth Edge: out-of-frame dim1 elements past the frame, in-frame dim2 rows, every dim3 plane */
+    if (verRightWidth > 0)
+    {
+      pdvecArr = (MORPH_IDT_VEC *) (pArr + verRightXcord);
+
+      /* priming of pArray */
+      vaArray = MORPH_OP_PRIME(pdvecArr);
+
+      for (x = 0; x < verRightWidth; x += vectorizationWidth)
+      {
+        /* Load pArray */
+        MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (verRightWidth - x) * bytesPerPixel);
+
+        for (z = 0; z < (dim3Size + dim3Edge1 + dim3Edge2); z++)
+        {
+          for (y = 0; y < verRightHeight; y++)
+          {
+            pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (z - dim3Edge1) * dstDataPitch2 + \
+                                          (verRightYcord + y) * dstDataPitch1 + verRightXcord + x);
+
+            /* store array value in destination */
+            MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (verRightWidth - x) * bytesPerPixel);
+
+            MORPH_OP_FLUSH(vaOutData, pdvecDst);
+          }
+        }
+      }
+    }
+  }
+}
+
+#if INPUT_DATA_TYPE == INTEGER8BIT
+/***********************   xaiExtendEdges3D_I8   *****************************/
+/* Description : API entry point of ExtendEdges3D for 8-bit tiles.          */
+/*               Validates the arguments, then calls the WHD or DWH         */
+/*               kernel according to the tile's data order.                 */
+/* Inputs      : pArray, frame3DSize                                        */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Destination Tile                                           */
+/****************************************************************************/
+XAI_ERR_TYPE xaiExtendEdges3D_I8(xai_pTile3D dstTile,
+                                 const xai_pArray pArray,
+                                 xai_size3D frame3DSize)
+{
+  /* Error Checks common to both supported data orders */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_I8(dstTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(dstTile);
+    XAI_CHECK_ERROR(
+      ((XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) || (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH)), \
+      XAI_ERR_BADARG, "Provided Data Order not supported.");
+    XAI_CHECK_POINTER(pArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_IS_CONSISTENT(pArray), XAI_ERR_BADARG, "The argument pArray is invalid");
+    XAI_CHECK_ERROR((frame3DSize.dim1Size > 0) && (frame3DSize.dim2Size > 0) &&                                                             \
+                    (frame3DSize.dim3Size > 0), XAI_ERR_DATASIZE,                                                                           \
+                    "\nframe3DSize.dim1Size = %d, frame3DSize.dim2Size = %d, frame3DSize.dim3Size = %d\nDimensions must be greater than 0", \
+                    frame3DSize.dim1Size, frame3DSize.dim2Size, frame3DSize.dim3Size);
+  }
+  if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) /* WHD: pArray must span dim3 plus both dim3 edges */
+  {
+    XAI_ERROR_CHECKS_CONTINUE()
+    {
+      XAI_CHECK_ERROR(
+        ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM3(dstTile)                                                                  \
+                                          + XAI_TILE3D_GET_DIM3_EDGE1(dstTile) + XAI_TILE3D_GET_DIM3_EDGE2(dstTile)))), XAI_ERR_BADARG, \
+        "pArray width parameter is not set as required");
+    }
+    extendEdges3D_I8_WHD(dstTile, pArray, frame3DSize);
+  }
+  else if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH) /* DWH: pArray must span dim1 plus both dim1 edges */
+  {
+    XAI_ERROR_CHECKS_CONTINUE()
+    {
+      XAI_CHECK_ERROR(
+        ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM1(dstTile)                                                                  \
+                                          + XAI_TILE3D_GET_DIM1_EDGE1(dstTile) + XAI_TILE3D_GET_DIM1_EDGE2(dstTile)))), XAI_ERR_BADARG, \
+        "pArray width parameter is not set as required");
+    }
+    extendEdges3D_I8_DWH(dstTile, pArray, frame3DSize);
+  }
+  else /* data order is neither WHD nor DWH: no kernel is run */
+  {
+    return(XAI_ERR_NO_VARIANT);
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
+#elif INPUT_DATA_TYPE == INTEGER16BIT
+/***********************   xaiExtendEdges3D_I16   ****************************/
+/* Description : API entry point of ExtendEdges3D for 16-bit tiles.         */
+/*               Validates the arguments, then calls the WHD or DWH         */
+/*               kernel according to the tile's data order.                 */
+/* Inputs      : pArray, frame3DSize                                        */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Destination Tile                                           */
+/****************************************************************************/
+XAI_ERR_TYPE xaiExtendEdges3D_I16(xai_pTile3D dstTile,
+                                  const xai_pArray pArray,
+                                  xai_size3D frame3DSize)
+{
+  /* Error Checks common to both supported data orders */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_X16(dstTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(dstTile);
+    XAI_CHECK_ERROR(
+      ((XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) || (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH)), \
+      XAI_ERR_BADARG, "Provided Data Order not supported.");
+    XAI_CHECK_POINTER(pArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_IS_CONSISTENT(pArray), XAI_ERR_BADARG, "The argument pArray is invalid");
+    XAI_CHECK_ERROR((frame3DSize.dim1Size > 0) && (frame3DSize.dim2Size > 0) &&                                                             \
+                    (frame3DSize.dim3Size > 0), XAI_ERR_DATASIZE,                                                                           \
+                    "\nframe3DSize.dim1Size = %d, frame3DSize.dim2Size = %d, frame3DSize.dim3Size = %d\nDimensions must be greater than 0", \
+                    frame3DSize.dim1Size, frame3DSize.dim2Size, frame3DSize.dim3Size);
+  }
+  if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) /* WHD: pArray must span dim3 plus both dim3 edges */
+  {
+    XAI_ERROR_CHECKS_CONTINUE()
+    {
+      XAI_CHECK_ERROR(
+        ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM3(dstTile)                                                                  \
+                                          + XAI_TILE3D_GET_DIM3_EDGE1(dstTile) + XAI_TILE3D_GET_DIM3_EDGE2(dstTile)))), XAI_ERR_BADARG, \
+        "pArray width parameter is not set as required");
+    }
+    extendEdges3D_I16_WHD(dstTile, pArray, frame3DSize);
+  }
+  else if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH) /* DWH: pArray must span dim1 plus both dim1 edges */
+  {
+    XAI_ERROR_CHECKS_CONTINUE()
+    {
+      XAI_CHECK_ERROR(
+        ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM1(dstTile)                                                                  \
+                                          + XAI_TILE3D_GET_DIM1_EDGE1(dstTile) + XAI_TILE3D_GET_DIM1_EDGE2(dstTile)))), XAI_ERR_BADARG, \
+        "pArray width parameter is not set as required");
+    }
+    extendEdges3D_I16_DWH(dstTile, pArray, frame3DSize);
+  }
+  else /* data order is neither WHD nor DWH: no kernel is run */
+  {
+    return(XAI_ERR_NO_VARIANT);
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
+#elif INPUT_DATA_TYPE == FLOAT16BIT
+#if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+/***********************   xaiExtendEdges3D_F16   ****************************/
+/* Description : Validating F16 dispatcher for ExtendEdges3D (WHD or DWH)   */
+/* Inputs      : pArray (edge fill values), frame3DSize                     */
+/* Outputs     : XI Error Code; XAI_ERR_NO_VARIANT for other data orders    */
+/* InOuts      : Destination Tile                                           */
+/****************************************************************************/
+XAI_ERR_TYPE xaiExtendEdges3D_F16(xai_pTile3D dstTile,
+                                  const xai_pArray pArray,
+                                  xai_size3D frame3DSize)
+{
+  /* Error Checks common to both supported data orders */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_F16(dstTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(dstTile);
+    XAI_CHECK_ERROR(
+      ((XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) || (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH)), \
+      XAI_ERR_BADARG, "Provided Data Order not supported.");
+    XAI_CHECK_POINTER(pArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_IS_CONSISTENT(pArray), XAI_ERR_BADARG, "The argument pArray is invalid");
+    XAI_CHECK_ERROR((frame3DSize.dim1Size > 0) && (frame3DSize.dim2Size > 0) &&                                                             \
+                    (frame3DSize.dim3Size > 0), XAI_ERR_DATASIZE,                                                                           \
+                    "\nframe3DSize.dim1Size = %d, frame3DSize.dim2Size = %d, frame3DSize.dim3Size = %d\nDimensions must be greater than 0", \
+                    frame3DSize.dim1Size, frame3DSize.dim2Size, frame3DSize.dim3Size);
+  }
+  if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) /* WHD: pArray must span dim3 plus both dim3 edges */
+  {
+    XAI_ERROR_CHECKS_CONTINUE()
+    {
+      XAI_CHECK_ERROR(
+        ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM3(dstTile) + XAI_TILE3D_GET_DIM3_EDGE1(dstTile) + XAI_TILE3D_GET_DIM3_EDGE2(dstTile)))),
+        XAI_ERR_BADARG, "pArray width parameter is not set as required");
+    }
+    extendEdges3D_F16_WHD(dstTile, pArray, frame3DSize);
+  }
+  else if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH) /* DWH: pArray must span dim1 plus both dim1 edges */
+  {
+    XAI_ERROR_CHECKS_CONTINUE()
+    {
+      XAI_CHECK_ERROR(
+        ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM1(dstTile) + XAI_TILE3D_GET_DIM1_EDGE1(dstTile) + XAI_TILE3D_GET_DIM1_EDGE2(dstTile)))),
+        XAI_ERR_BADARG, "pArray width parameter is not set as required");
+    }
+    extendEdges3D_F16_DWH(dstTile, pArray, frame3DSize);
+  }
+  else /* matches the I8/I16 entry points: report that no kernel exists instead of returning the check status */
+  {
+    return(XAI_ERR_NO_VARIANT);
+  }
+  return(XAI_ERROR_STATUS());
+}
+#endif //#if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+
+#elif INPUT_DATA_TYPE == FLOAT32BIT
+#if (XCHAL_HAVE_VISION_SP_VFPU == 1)
+/***********************   xaiExtendEdges3D_F32   ****************************/
+/* Description : Validating F32 dispatcher for ExtendEdges3D (WHD or DWH)   */
+/* Inputs      : pArray (edge fill values), frame3DSize                     */
+/* Outputs     : XI Error Code; XAI_ERR_NO_VARIANT for other data orders    */
+/* InOuts      : Destination Tile                                           */
+/****************************************************************************/
+XAI_ERR_TYPE xaiExtendEdges3D_F32(xai_pTile3D dstTile,
+                                  const xai_pArray pArray,
+                                  xai_size3D frame3DSize)
+{
+  /* Error Checks common to both supported data orders */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_F32(dstTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(dstTile);
+    XAI_CHECK_ERROR(
+      ((XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) || (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH)), \
+      XAI_ERR_BADARG, "Provided Data Order not supported.");
+    XAI_CHECK_POINTER(pArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_IS_CONSISTENT(pArray), XAI_ERR_BADARG, "The argument pArray is invalid");
+    XAI_CHECK_ERROR((frame3DSize.dim1Size > 0) && (frame3DSize.dim2Size > 0) &&                                                             \
+                    (frame3DSize.dim3Size > 0), XAI_ERR_DATASIZE,                                                                           \
+                    "\nframe3DSize.dim1Size = %d, frame3DSize.dim2Size = %d, frame3DSize.dim3Size = %d\nDimensions must be greater than 0", \
+                    frame3DSize.dim1Size, frame3DSize.dim2Size, frame3DSize.dim3Size);
+  }
+  if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) /* WHD: pArray must span dim3 plus both dim3 edges */
+  {
+    XAI_ERROR_CHECKS_CONTINUE()
+    {
+      XAI_CHECK_ERROR(
+        ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM3(dstTile) + XAI_TILE3D_GET_DIM3_EDGE1(dstTile) + XAI_TILE3D_GET_DIM3_EDGE2(dstTile)))),
+        XAI_ERR_BADARG, "pArray width parameter is not set as required");
+    }
+    extendEdges3D_F32_WHD(dstTile, pArray, frame3DSize);
+  }
+  else if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH) /* DWH: pArray must span dim1 plus both dim1 edges */
+  {
+    XAI_ERROR_CHECKS_CONTINUE()
+    {
+      XAI_CHECK_ERROR(
+        ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM1(dstTile) + XAI_TILE3D_GET_DIM1_EDGE1(dstTile) + XAI_TILE3D_GET_DIM1_EDGE2(dstTile)))),
+        XAI_ERR_BADARG, "pArray width parameter is not set as required");
+    }
+    extendEdges3D_F32_DWH(dstTile, pArray, frame3DSize);
+  }
+  else /* matches the I8/I16 entry points: report that no kernel exists instead of returning the check status */
+  {
+    return(XAI_ERR_NO_VARIANT);
+  }
+  return(XAI_ERROR_STATUS());
+}
+#endif //#if (XCHAL_HAVE_VISION_SP_VFPU == 1)
+#endif //INPUT_DATA_TYPE
+
+/*====================================================================================*/
+/*=============== END of xaiExtendEdges3D_* routines =================================*/
+/*====================================================================================*/
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_fill_tile.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_fill_tile.h
new file mode 100644
index 00000000000..dcabdd096f6
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_fill_tile.h
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2021 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+
+
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT)  name ## _ ## MORPH_FNAME_SPECIFIER_IDT
+/* Bind the MORPH_* names used by the function below to the element type selected by INPUT_DATA_TYPE. */
+#if INPUT_DATA_TYPE == INTEGER8BIT
+/* int8_t elements: step of 2 * XCHAL_IVPN_SIMD_WIDTH elements per full store, 1 byte each. */
+#undef MAKE_ARGUMENTS
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_VECTORIZATION_WIDTH
+#undef MORPH_OP_STORE_IP
+#undef MORPH_OP_VAR_STORE_XP
+#undef MORPH_OP_PRIME
+#undef MORPH_OP_FLUSH
+#undef MORPH_BYTES_PER_PIXEL
+
+#define MAKE_ARGUMENTS(a, b, c)  (xai_pTile3D a, const int32_t b, xai_bool c)
+#define MAKE_NAME(name)          MAKE_NAME_IMPL(name, I8)
+#define MORPH_IDT_CHECK            XAI_CHECK_TILE3D_I8
+#define MORPH_IDT_SCALAR           int8_t
+#define MORPH_IDT_VECTOR           xb_vec2Nx8
+#define MORPH_VECTORIZATION_WIDTH  (2 * XCHAL_IVPN_SIMD_WIDTH)
+#define MORPH_OP_STORE_IP          IVP_SA2NX8_IP
+#define MORPH_OP_VAR_STORE_XP      IVP_SAV2NX8_XP
+#define MORPH_OP_PRIME             IVP_LA2NX8_PP
+#define MORPH_OP_FLUSH             IVP_SAPOS2NX8_FP
+#define MORPH_BYTES_PER_PIXEL      1
+
+#elif INPUT_DATA_TYPE == INTEGER16BIT
+/* int16_t elements: step of XCHAL_IVPN_SIMD_WIDTH elements per full store, 2 bytes each. */
+#undef MAKE_ARGUMENTS
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_VECTORIZATION_WIDTH
+#undef MORPH_OP_STORE_IP
+#undef MORPH_OP_VAR_STORE_XP
+#undef MORPH_OP_PRIME
+#undef MORPH_OP_FLUSH
+#undef MORPH_BYTES_PER_PIXEL
+
+#define MAKE_ARGUMENTS(a, b, c)  (xai_pTile3D a, const int32_t b, xai_bool c)
+#define MAKE_NAME(name)          MAKE_NAME_IMPL(name, I16)
+#define MORPH_IDT_CHECK            XAI_CHECK_TILE3D_I16
+#define MORPH_IDT_SCALAR           int16_t
+#define MORPH_IDT_VECTOR           xb_vecNx16
+#define MORPH_VECTORIZATION_WIDTH  (XCHAL_IVPN_SIMD_WIDTH)
+#define MORPH_OP_STORE_IP          IVP_SANX16_IP
+#define MORPH_OP_VAR_STORE_XP      IVP_SAVNX16_XP
+#define MORPH_OP_PRIME             IVP_LANX16_PP
+#define MORPH_OP_FLUSH             IVP_SAPOSNX16_FP
+#define MORPH_BYTES_PER_PIXEL      2
+
+#elif INPUT_DATA_TYPE == FLOAT16BIT
+/* xb_f16 elements; the bindings are (re)defined only when XCHAL_HAVE_VISION_HP_VFPU == 1. */
+#undef MAKE_ARGUMENTS
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_VECTORIZATION_WIDTH
+#undef MORPH_OP_STORE_IP
+#undef MORPH_OP_VAR_STORE_XP
+#undef MORPH_OP_PRIME
+#undef MORPH_OP_FLUSH
+#undef MORPH_BYTES_PER_PIXEL
+
+#if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+#define MAKE_ARGUMENTS(a, b, c)  (xai_pTile3D a, const xb_f16 b, xai_bool c)
+#define MAKE_NAME(name)          MAKE_NAME_IMPL(name, F16)
+#define MORPH_IDT_CHECK            XAI_CHECK_TILE3D_F16
+#define MORPH_IDT_SCALAR           xb_f16
+#define MORPH_IDT_VECTOR           xb_vecNxf16
+#define MORPH_VECTORIZATION_WIDTH  (XCHAL_IVPN_SIMD_WIDTH)
+#define MORPH_OP_STORE_IP          IVP_SANXF16_IP
+#define MORPH_OP_VAR_STORE_XP      IVP_SAVNXF16_XP
+#define MORPH_OP_PRIME             IVP_LANXF16_PP
+#define MORPH_OP_FLUSH             IVP_SAPOSNXF16_FP
+#define MORPH_BYTES_PER_PIXEL      2
+#endif
+
+#elif INPUT_DATA_TYPE == FLOAT32BIT
+/* float elements; the bindings are (re)defined only when XCHAL_HAVE_VISION_SP_VFPU == 1. */
+#undef MAKE_ARGUMENTS
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_VECTORIZATION_WIDTH
+#undef MORPH_OP_STORE_IP
+#undef MORPH_OP_VAR_STORE_XP
+#undef MORPH_OP_PRIME
+#undef MORPH_OP_FLUSH
+#undef MORPH_BYTES_PER_PIXEL
+
+#if (XCHAL_HAVE_VISION_SP_VFPU == 1)
+#define MAKE_ARGUMENTS(a, b, c)  (xai_pTile3D a, const float b, xai_bool c)
+#define MAKE_NAME(name)          MAKE_NAME_IMPL(name, F32)
+#define MORPH_IDT_CHECK            XAI_CHECK_TILE3D_F32
+#define MORPH_IDT_SCALAR           float
+#define MORPH_IDT_VECTOR           xb_vecN_2xf32
+#define MORPH_VECTORIZATION_WIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)
+#define MORPH_OP_STORE_IP          IVP_SAN_2XF32_IP
+#define MORPH_OP_VAR_STORE_XP      IVP_SAVN_2XF32_XP
+#define MORPH_OP_PRIME             IVP_LAN_2XF32_PP
+#define MORPH_OP_FLUSH             IVP_SAPOSN_2XF32_FP
+#define MORPH_BYTES_PER_PIXEL      4
+#endif
+#endif
+
+/**************************************************************************************/
+/*                                 MAKE_NAME(xaiFillTile3D)                            */
+/**************************************************************************************/
+
+/*******************************   xaiFillTile3D  *************************************/
+/* Description : P6 optimized generic implementation of FillTile 3D function.         */
+/*               Based on MORPH pre-processor specifiers, code implementation         */
+/*               is generated during pre-processing stage. This method implements     */
+/*               xaiFillTile3D_I8, xaiFillTile3D_I16, xaiFillTile3D_F16 and           */
+/*               xaiFillTile3D_F32 functionality.                                     */
+/* Inputs      : Constant value to fill, fill_edge_extension                          */
+/* Outputs     : XI Error Code                                                        */
+/* InOuts      : Output Tile                                                          */
+/* Assumptions : OutData is signed 8/16 bit Integer or half precision float(FP16) or  */
+/*               single precision float(FP32) based on MORPH specifier                */
+/**************************************************************************************/
+
+/****************************** xaiFillTile3D_I8 ***************************************/
+/****************************** xaiFillTile3D_I16 **************************************/
+/****************************** xaiFillTile3D_F16 **************************************/
+/****************************** xaiFillTile3D_F32 **************************************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiFillTile3D) MAKE_ARGUMENTS(dstTile, value, fill_edge_extension)
+{
+  /* Error checks: element-type check (MORPH_IDT_CHECK) and DRAM-boundary check on dstTile */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(dstTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(dstTile);
+  }
+
+  /* Read sizes, edge widths, pitches and the data pointer from dstTile */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(dstTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(dstTile);
+  const int32_t dim1Edge1     = XAI_TILE3D_GET_DIM1_EDGE1(dstTile);
+  const int32_t dim1Edge2     = XAI_TILE3D_GET_DIM1_EDGE2(dstTile);
+  const int32_t dim2Edge1     = XAI_TILE3D_GET_DIM2_EDGE1(dstTile);
+  const int32_t dim2Edge2     = XAI_TILE3D_GET_DIM2_EDGE2(dstTile);
+  const int32_t dim3Edge1     = XAI_TILE3D_GET_DIM3_EDGE1(dstTile);
+  const int32_t dim3Edge2     = XAI_TILE3D_GET_DIM3_EDGE2(dstTile);
+  const int32_t dstDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(dstTile);
+  const int32_t dstDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(dstTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(dstTile);
+  MORPH_IDT_SCALAR *pDst      = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(dstTile);
+
+  int32_t z, x, y;
+  /* Vectorization for xaiFillTile3D function is always done across the first dimension */
+  int32_t vectorizationWidth = MORPH_VECTORIZATION_WIDTH;
+  int32_t dim1FillSize       = dim1Size;
+  int32_t dim2FillSize       = dim2Size;
+  int32_t dim3FillSize       = dim3Size;
+  int32_t maxLoopCount;
+
+  MORPH_IDT_VECTOR* restrict pdvecOut;
+  valign vaOutData          = IVP_ZALIGN();
+  MORPH_IDT_VECTOR vecValue = value;
+
+  /* If fill_edge_extension is set, grow the fill size of every dimension by */
+  /* both of its edges and move pDst back by dim1Edge1 elements, dim2Edge1   */
+  /* rows (dstDataPitch1) and dim3Edge1 planes (dstDataPitch2).              */
+  if (fill_edge_extension)
+  {
+    dim1FillSize = dim1Size + dim1Edge1 + dim1Edge2;
+    dim2FillSize = dim2Size + dim2Edge1 + dim2Edge2;
+    dim3FillSize = dim3Size + dim3Edge1 + dim3Edge2;
+    pDst         = &pDst[-dim1Edge1 + ((-dim2Edge1) * dstDataPitch1) + ((-dim3Edge1) * dstDataPitch2)];
+  }
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When destination tile pitch is equal to destination tile fill size.     */
+  /*    - If above condition holds good, memory location to be filled           */
+  /*      with constant value is contiguous. Hence vectorization can be         */
+  /*      utilized effectively                                                  */
+  /* 2. When destination tile pitch is greater than destination tile fill size. */
+  /*    - If above condition holds good, memory location to be filled           */
+  /*      with constant value is not contiguous. In order to do                 */
+  /*      vectorization across first dimension, destination data pointers       */
+  /*      need to be updated based on destination tile fill size and            */
+  /*      destination tile pitch                                                */
+  /******************************************************************************/
+  if (dstDataPitch1 == dim1FillSize)
+  {
+    /* Data to be filled exist in contiguous memory location with respect to */
+    /* first dimension                                                       */
+
+    /* By default one z-iteration fills one dim1 x dim2 plane (maxLoopCount elements) */
+    int32_t dim3MaxLoopCount = dim3FillSize;
+    maxLoopCount = dim1FillSize * dim2FillSize;
+    if (dstDataPitch2 == maxLoopCount)
+    {
+      /* Data to be filled exist in contiguous memory location with respect to */
+      /* first and second dimension                                            */
+
+      /* Planes are back to back as well: fill the whole region in a single pass */
+      maxLoopCount    *= dim3FillSize;
+      dim3MaxLoopCount = 1;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize destination data pointer to the start of pass z */
+      pdvecOut = (MORPH_IDT_VECTOR *) (pDst + (z * dstDataPitch2));
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+      }
+      /* Tail: variable-length store of the remaining (maxLoopCount - x) elements, in bytes */
+      MORPH_OP_VAR_STORE_XP(vecValue, vaOutData, pdvecOut,
+                            (maxLoopCount - x) * MORPH_BYTES_PER_PIXEL);
+      MORPH_OP_FLUSH(vaOutData, pdvecOut);
+    }
+  }
+  else
+  {
+    /* else block execute if destination tile pitch is */
+    /* greater than destination tile fill size         */
+    for (z = 0; z < dim3FillSize; z++) /* Loop across dim3 */
+    {
+      x = 0;
+      /* Walk dim1 in chunks of 4 * vectorizationWidth; for each chunk sweep  */
+      /* dim2 so every row gets 3 full stores plus one variable-length store. */
+      /* Whatever is left in dim1 is handled by the if/else-if chain below.   */
+      for (; x < (dim1FillSize - 3 * vectorizationWidth); x += 4 * vectorizationWidth)
+      {
+        /* initialize destination data pointer */
+        MORPH_IDT_SCALAR *pDst1 = pDst + x + (z * dstDataPitch2);
+        for (y = 0; y < dim2FillSize; y++) /* Loop across dim2 */
+        {
+          pdvecOut = (MORPH_IDT_VECTOR *) (pDst1 + (y * dstDataPitch1));
+          MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+          MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+          MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+          MORPH_OP_VAR_STORE_XP(vecValue, vaOutData, pdvecOut,
+                                (dim1FillSize - (x + 3 * vectorizationWidth)) * MORPH_BYTES_PER_PIXEL);
+          MORPH_OP_FLUSH(vaOutData, pdvecOut);
+        }
+      }
+      if (x < (dim1FillSize - 2 * vectorizationWidth))
+      {
+        /* More than 2 vectors left in dim1: 2 full stores + variable-length tail per row */
+        MORPH_IDT_SCALAR *pDst1 = pDst + x + (z * dstDataPitch2);
+        for (y = 0; y < dim2FillSize; y++) /* Loop across dim2 */
+        {
+          pdvecOut = (MORPH_IDT_VECTOR *) (pDst1 + (y * dstDataPitch1));
+          MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+          MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+          MORPH_OP_VAR_STORE_XP(vecValue, vaOutData, pdvecOut,
+                                (dim1FillSize - (x + 2 * vectorizationWidth)) * MORPH_BYTES_PER_PIXEL);
+          MORPH_OP_FLUSH(vaOutData, pdvecOut);
+        }
+      }
+      else if (x < (dim1FillSize - vectorizationWidth))
+      {
+        /* More than 1 vector left in dim1: 1 full store + variable-length tail per row */
+        MORPH_IDT_SCALAR *pDst1 = pDst + x + (z * dstDataPitch2);
+        for (y = 0; y < dim2FillSize; y++) /* Loop across dim2 */
+        {
+          pdvecOut = (MORPH_IDT_VECTOR *) (pDst1 + (y * dstDataPitch1));
+          MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+          MORPH_OP_VAR_STORE_XP(vecValue, vaOutData, pdvecOut,
+                                (dim1FillSize - (x + vectorizationWidth)) * MORPH_BYTES_PER_PIXEL);
+          MORPH_OP_FLUSH(vaOutData, pdvecOut);
+        }
+      }
+      else if (x < dim1FillSize)
+      {
+        /* At most 1 vector left in dim1: a single variable-length store per row */
+        MORPH_IDT_SCALAR *pDst1 = pDst + x + (z * dstDataPitch2);
+        for (y = 0; y < dim2FillSize; y++) /* Loop across dim2 */
+        {
+          pdvecOut = (MORPH_IDT_VECTOR *) (pDst1 + (y * dstDataPitch1));
+          MORPH_OP_VAR_STORE_XP(vecValue, vaOutData, pdvecOut,
+                                (dim1FillSize - x) * MORPH_BYTES_PER_PIXEL);
+          MORPH_OP_FLUSH(vaOutData, pdvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+#endif //if ((XCHAL_VISION_TYPE >= 6))
diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_helper.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_helper.c
new file mode 100644
index 00000000000..e024b12440a
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_helper.c
@@ -0,0 +1,2141 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn.h"
+#include "xai_intrin.h"
+
+#if ((XCHAL_VISION_TYPE >= 6))
+
+#define S24_MIN  (-(((int32_t) 1) << 23))  /* -2^23: lowest value of a signed 24-bit integer */
+#define S24_MAX  ((((int32_t) 1) << 23) - 1)  /* 2^23 - 1: highest value of a signed 24-bit integer */
+
+/****************************************************************************/
+/* Description : Reports how a WHDN coefficient tile splits into            */
+/*               strideX * strideY deconvolution sub-kernels.               */
+/*               getNumKernelsFlag != 0: writes only *numSubKernels.        */
+/*               getNumKernelsFlag == 0: sets DIM1..DIM4 of each            */
+/*               subCoeffInfo[kIdy * strideX + kIdx]; DIM3 and DIM4 are     */
+/*               swapped relative to coeffTile.                             */
+/* Inputs      : Input Coeff Tile, stride along X & Y directions,           */
+/*               getNumKernelsFlag                                          */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Array of Coeff Sub, numSubKernels.                         */
+/* Assumptions : Coeff is in WHDN format                                    */
+/****************************************************************************/
+XAI_ERR_TYPE xaiDeConvGetDim4D_WHDN(const xai_pTile4D coeffTile,
+                                    xai_pTile4D subCoeffInfo[],
+                                    uint16_t *numSubKernels,
+                                    const uint8_t strideX,
+                                    const uint8_t strideY,
+                                    const uint8_t getNumKernelsFlag)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    if (getNumKernelsFlag)
+    {
+      XAI_CHECK_POINTER(numSubKernels);
+    }
+    XAI_CHECK_ERROR((strideX > 0) && (strideY > 0),                                          \
+                    XAI_ERR_BADARG, "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \
+                    strideX, strideY);
+  }
+  if (getNumKernelsFlag)
+  {
+    *numSubKernels = strideX * strideY; /* query mode: coeffTile and subCoeffInfo are not touched */
+    return(XAI_ERROR_STATUS());
+  }
+
+  int32_t kIdx, kIdy, kernelIdx;
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_TILE4D(coeffTile);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_POINTER(subCoeffInfo); /* check the array itself before indexing it below */
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        XAI_CHECK_POINTER(subCoeffInfo[kernelIdx]);
+      }
+    }
+  }
+
+  const int32_t kWidth  = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t kHeight = XAI_TILE4D_GET_DIM2(coeffTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_ERROR((strideX <= kWidth) && (strideY <= kHeight), XAI_ERR_BADARG, \
+                    "\nstrideX = %hhu, kWidth = %d and strideY = %hhu, kHeight = %d\nStride should be less than or equal to corresponding Kernel Dimension", \
+                    strideX, kWidth, strideY, kHeight);
+  }
+
+  for (kIdy = 0; kIdy < strideY; kIdy++)
+  {
+    for (kIdx = 0; kIdx < strideX; kIdx++)
+    {
+      kernelIdx = kIdy * strideX + kIdx;
+      /* DIM1/DIM2 = ceil((kernel size - phase index) / stride) */
+      XAI_TILE4D_SET_DIM1(subCoeffInfo[kernelIdx], \
+                          (kWidth + strideX - kIdx - 1) / strideX);
+      XAI_TILE4D_SET_DIM2(subCoeffInfo[kernelIdx], \
+                          (kHeight + strideY - kIdy - 1) / strideY);
+      XAI_TILE4D_SET_DIM3(subCoeffInfo[kernelIdx], \
+                          XAI_TILE4D_GET_DIM4(coeffTile));
+      XAI_TILE4D_SET_DIM4(subCoeffInfo[kernelIdx], \
+                          XAI_TILE4D_GET_DIM3(coeffTile));
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Reports how a WHD coefficient tile splits into             */
+/*               strideX * strideY deconvolution sub-kernels.               */
+/*               getNumKernelsFlag != 0: writes only *numSubKernels.        */
+/*               getNumKernelsFlag == 0: sets DIM1..DIM3 of each            */
+/*               subCoeffInfo[kIdy * strideX + kIdx]; DIM3 is copied        */
+/*               from coeffTile unchanged.                                  */
+/* Inputs      : Input Coeff Tile, stride along X & Y directions,           */
+/*               getNumKernelsFlag                                          */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Array of Coeff Sub, numSubKernels.                         */
+/* Assumptions : Coeff is in WHD format                                     */
+/****************************************************************************/
+XAI_ERR_TYPE xaiDeConvGetDim3D_WHD(const xai_pTile3D coeffTile,
+                                   xai_pTile3D subCoeffInfo[],
+                                   uint16_t *numSubKernels,
+                                   const uint8_t strideX,
+                                   const uint8_t strideY,
+                                   const uint8_t getNumKernelsFlag)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    if (getNumKernelsFlag)
+    {
+      XAI_CHECK_POINTER(numSubKernels);
+    }
+    XAI_CHECK_ERROR((strideX > 0) && (strideY > 0),                                          \
+                    XAI_ERR_BADARG, "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \
+                    strideX, strideY);
+  }
+  if (getNumKernelsFlag)
+  {
+    *numSubKernels = strideX * strideY; /* query mode: coeffTile and subCoeffInfo are not touched */
+    return(XAI_ERROR_STATUS());
+  }
+
+  int32_t kIdx, kIdy, kernelIdx;
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_TILE3D(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(coeffTile, XAI_WHD);
+    XAI_CHECK_POINTER(subCoeffInfo); /* check the array itself before indexing it below */
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        XAI_CHECK_POINTER(subCoeffInfo[kernelIdx]);
+      }
+    }
+  }
+
+  const int32_t kWidth  = XAI_TILE3D_GET_DIM1(coeffTile);
+  const int32_t kHeight = XAI_TILE3D_GET_DIM2(coeffTile);
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_ERROR((strideX <= kWidth) && (strideY <= kHeight),                                                                                 \
+                    XAI_ERR_BADARG,                                                                                                              \
+                    "\nstrideX = %hhu, kWidth = %d and strideY = %hhu, kHeight = %d\nStride should be less than or equal to corresponding Kernel Dimension", \
+                    strideX, kWidth, strideY, kHeight);
+  }
+
+  for (kIdy = 0; kIdy < strideY; kIdy++)
+  {
+    for (kIdx = 0; kIdx < strideX; kIdx++)
+    {
+      kernelIdx = kIdy * strideX + kIdx;
+      /* DIM1/DIM2 = ceil((kernel size - phase index) / stride) */
+      XAI_TILE3D_SET_DIM1(subCoeffInfo[kernelIdx], \
+                          (kWidth + strideX - kIdx - 1) / strideX);
+      XAI_TILE3D_SET_DIM2(subCoeffInfo[kernelIdx], \
+                          (kHeight + strideY - kIdy - 1) / strideY);
+      XAI_TILE3D_SET_DIM3(subCoeffInfo[kernelIdx], \
+                          XAI_TILE3D_GET_DIM3(coeffTile)); /* 3D accessor: coeffTile is an xai_pTile3D */
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Splits a WHDN coefficient tile into strideX * strideY      */
+/*               sub-kernels for deconvolution.                             */
+/*               - Each sub-kernel is written densely (pitches of subCoeffs */
+/*                 are not consulted) in W, H, N, D order (W fastest).      */
+/*               - transposeCoeffsFlag != 0 additionally flips every        */
+/*                 sub-kernel across width and height.                      */
+/* Inputs      : Input Coeff Tile, CNN convolution params structure,        */
+/*               transposeCoeffsFlag                                        */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Array of Coeff Sub & Super Tiles                           */
+/* Assumptions : CoeffData is S8/U8                                         */
+/*               Coeff is in WHDN format                                    */
+/****************************************************************************/
+XAI_ERR_TYPE xaiDeConvReOrder4D_I8_WHDN(const xai_pTile4D inTile,
+                                        xai_pTile4D subCoeffs[],
+                                        const xai_cnn_conv_params *param,
+                                        const uint8_t transposeCoeffsFlag)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE4D_I8(inTile);
+    XAI_CHECK_TILE4D_DATA_ORDER(inTile, XAI_WHDN);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_POINTER(subCoeffs);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) >= 1) &&                                                 \
+                     (XAI_CNN_CONV_GET_STRIDEX(param) <= XAI_TILE4D_GET_DIM1(inTile))) &&                      \
+                    ((XAI_CNN_CONV_GET_STRIDEY(param) >= 1) &&                                                 \
+                     (XAI_CNN_CONV_GET_STRIDEY(param) <= XAI_TILE4D_GET_DIM2(inTile))), XAI_ERR_BADARG,        \
+                    "StrideX = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Width) \
+      \nStrideY = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Height)", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_TILE4D_GET_DIM1(inTile),                              \
+                    XAI_CNN_CONV_GET_STRIDEY(param), XAI_TILE4D_GET_DIM2(inTile));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1), \
+                    XAI_ERR_BADARG, "\nDilation parameter is %hhu\nDilation parameter should be equal to 1", XAI_CNN_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                          \
+                    XAI_ERR_BADARG, "\nDilation along width is %hhu and dilation along height is %hhu are not same", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+  }
+
+  int32_t kIdx, kIdy;
+  int32_t kernelIdx;
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    for (kIdy = 0; kIdy < XAI_CNN_CONV_GET_STRIDEY(param); kIdy++)
+    {
+      for (kIdx = 0; kIdx < XAI_CNN_CONV_GET_STRIDEX(param); kIdx++)
+      {
+        kernelIdx = kIdy * XAI_CNN_CONV_GET_STRIDEX(param) + kIdx;
+        XAI_CHECK_TILE4D_I8(subCoeffs[kernelIdx]);
+        XAI_CHECK_TILE4D_DATA_ORDER(subCoeffs[kernelIdx], XAI_WHDN);
+      }
+    }
+  }
+  int8_t *pInCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(inTile);
+
+  const int32_t kWidth   = XAI_TILE4D_GET_DIM1(inTile); /* W */
+  const int32_t kHeight  = XAI_TILE4D_GET_DIM2(inTile); /* H */
+  const int32_t numInCh  = XAI_TILE4D_GET_DIM3(inTile); /* D */
+  const int32_t numOutCh = XAI_TILE4D_GET_DIM4(inTile); /* N */
+
+  const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param);
+
+  int32_t inCoeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(inTile);
+  int32_t inCoeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(inTile);
+  int32_t inCoeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(inTile);
+
+  int32_t kx, ky, inCh, outCh, inIdx, outIdx = 0;
+  int8_t *pSubCoeff;
+  int32_t kxStart, kyStart;
+
+  /* Both branches walk sub-kernels in the same (kIdy, kIdx) order and differ only in tap direction */
+  if (transposeCoeffsFlag)
+  {
+    /* Conversion from WHDN -> WHND,                       */
+    /* transposing of kernels and formation of sub-kernels */
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        pSubCoeff = \
+          (int8_t *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]);
+
+        outIdx  = 0; /* every sub-kernel is written from its own offset 0 */
+        kyStart = kHeight - 1 - ((kHeight + strideY - kIdy - 1) % strideY);
+        kxStart = kWidth - 1 - ((kWidth + strideX - kIdx - 1) % strideX);
+
+        for (inCh = 0; inCh < numInCh; inCh++)            /* D */
+        {
+          for (outCh = 0; outCh < numOutCh; outCh++)      /* N */
+          {
+            for (ky = kyStart; ky >= 0; ky -= strideY)    /* H */
+            {
+              for (kx = kxStart; kx >= 0; kx -= strideX)  /* W */
+              {
+                inIdx = outCh * inCoeffPitch3 + inCh * inCoeffPitch2 + \
+                        ky * inCoeffPitch1 + kx;
+                pSubCoeff[outIdx++] = pInCoeff[inIdx];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  else
+  {
+    /* Conversion from WHDN -> WHND and formation of sub-kernels */
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        pSubCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]);
+
+        outIdx  = 0; /* every sub-kernel is written from its own offset 0 */
+        kyStart = ((kHeight + strideY - kIdy - 1) % strideY);
+        kxStart = ((kWidth + strideX - kIdx - 1) % strideX);
+
+        for (inCh = 0; inCh < numInCh; inCh++)                 /* D */
+        {
+          for (outCh = 0; outCh < numOutCh; outCh++)           /* N */
+          {
+            for (ky = kyStart; ky < kHeight; ky += strideY)    /* H */
+            {
+              for (kx = kxStart; kx < kWidth; kx += strideX)   /* W */
+              {
+                inIdx = outCh * inCoeffPitch3 + inCh * inCoeffPitch2 + \
+                        ky * inCoeffPitch1 + kx;
+                pSubCoeff[outIdx++] = pInCoeff[inIdx];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Implementation for coefficient reordering                  */
+/*               The functions does the following:                          */
+/*               - Flips the coefficients across width and height which is  */
+/*                 controlled by transposeCoeffsFlag.                       */
+/*               - Breaks the kernel into sub-kernels.                      */
+/* Inputs      : Input Coeff Tile, CNN convolution params structure,        */
+/*               transposeCoeffsFlag                                        */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Array of Coeff Sub & Super Tiles                           */
+/* Assumptions : CoeffData is S8/U8                                         */
+/*               Coeff is in WHD format                                     */
+/****************************************************************************/
+XAI_ERR_TYPE xaiDeConvReOrder3D_I8_WHD(const xai_pTile3D inTile,
+                                       xai_pTile3D subCoeffs[],
+                                       const xai_cnn_depthwiseDilatedConv_params *param,
+                                       const uint8_t transposeCoeffsFlag)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_I8(inTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_POINTER(subCoeffs);
+    XAI_CHECK_ERROR(((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param) >= 1) &&                                          \
+                     (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param) <= XAI_TILE3D_GET_DIM1(inTile))) &&               \
+                    ((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param) >= 1) &&                                          \
+                     (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param) <= XAI_TILE3D_GET_DIM2(inTile))), XAI_ERR_BADARG, \
+                    "StrideX = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Width) \
+            \nStrideY = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Height)",      \
+                    XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param), XAI_TILE3D_GET_DIM1(inTile),                       \
+                    XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param), XAI_TILE3D_GET_DIM2(inTile));
+    XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param) == 1),                     \
+                    XAI_ERR_BADARG, "\nDilation is %hhu\nDilation parameter should be equal to 1", \
+                    XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param) == XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param), \
+                    XAI_ERR_BADARG, "\nDilation along width is %hhu and dilation along height is %hhu are not same",            \
+                    XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param));
+  }
+
+  int32_t kIdx, kIdy;
+  int32_t kernelIdx;
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    for (kIdy = 0; kIdy < XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param); kIdy++)
+    {
+      for (kIdx = 0; kIdx < XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param); kIdx++)
+      {
+        kernelIdx = kIdy * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param) + kIdx;
+        XAI_CHECK_TILE3D_I8(subCoeffs[kernelIdx]);
+        XAI_CHECK_TILE3D_DATA_ORDER(subCoeffs[kernelIdx], XAI_WHD);
+      }
+    }
+  }
+  int8_t *pInCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(inTile);
+
+  const int32_t kWidth  = XAI_TILE4D_GET_DIM1(inTile);    /* W */
+  const int32_t kHeight = XAI_TILE4D_GET_DIM2(inTile);    /* H */
+  const int32_t numInCh = XAI_TILE4D_GET_DIM3(inTile);    /* D */
+
+
+  const uint8_t strideX = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param);
+  const uint8_t strideY = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param);
+
+  int32_t inCoeffPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  int32_t inCoeffPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  int32_t kx, ky, inCh, inIdx, outIdx = 0;
+  int8_t *pSubCoeff;
+  int32_t kxStart, kyStart;
+
+
+  if (transposeCoeffsFlag)
+  {
+    /* transposing of kernels and formation of sub-kernels */
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        int8_t *pSubCoeff = \
+          (int8_t *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]);
+
+        outIdx  = 0;
+        kyStart = kHeight - 1 - ((kHeight + strideY - kIdy - 1) % strideY);
+        kxStart = kWidth - 1 - ((kWidth + strideX - kIdx - 1) % strideX);
+
+        for (inCh = 0; inCh < numInCh; inCh++)            /* D */
+        {
+          for (ky = kyStart; ky >= 0; ky -= strideY)    /* H */
+          {
+            for (kx = kxStart; kx >= 0; kx -= strideX)  /* W */
+            {
+              inIdx = inCh * inCoeffPitch2 + \
+                      ky * inCoeffPitch1 + kx;
+              pSubCoeff[outIdx++] = pInCoeff[inIdx];
+            }
+          }
+        }
+      }
+    }
+  }
+  else
+  {
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        pSubCoeff = (int8_t *) XAI_TILE3D_GET_DATA_PTR(subCoeffs[kernelIdx]);
+
+        outIdx  = 0;
+        kyStart = ((kHeight + strideY - kIdy - 1) % strideY);
+        kxStart = ((kWidth + strideX - kIdx - 1) % strideX);
+
+        for (inCh = 0; inCh < numInCh; inCh++)                 /* D */
+        {
+          for (ky = kyStart; ky < kHeight; ky += strideY)    /* H */
+          {
+            for (kx = kxStart; kx < kWidth; kx += strideX)   /* W */
+            {
+              inIdx = inCh * inCoeffPitch2 + \
+                      ky * inCoeffPitch1 + kx;
+              pSubCoeff[outIdx++] = pInCoeff[inIdx];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Tiles the input bias array across the output bias array,   */
+/*               in case of MOD deconvolution using superkernels.           */
+/* Inputs      : Input Bias array (S32, width must be > 0)                  */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Bias array (outWidth / inWidth copies are written)  */
+/****************************************************************************/
+XAI_ERR_TYPE xaiBiasExtend_S32_MOD(const xai_pArray inBiasArray,
+                                   xai_pArray outBiasArray)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_ARRAY_S32(inBiasArray);
+    XAI_CHECK_ARRAY_S32(outBiasArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(inBiasArray) > 0, XAI_ERR_BADARG, "\nInput bias array width = %d, value must be greater than 0", XAI_ARRAY_GET_WIDTH(inBiasArray));
+  }
+
+  int32_t inWidth  = XAI_ARRAY_GET_WIDTH(inBiasArray);
+  int32_t outWidth = XAI_ARRAY_GET_WIDTH(outBiasArray);
+  int32_t strideX  = outWidth / inWidth;    /* whole copies that fit; inWidth is a divisor, hence the width check above */
+  int32_t* pInBias  = (int32_t *) XAI_ARRAY_GET_DATA_PTR(inBiasArray);
+  int32_t* pOutBias = (int32_t *) XAI_ARRAY_GET_DATA_PTR(outBiasArray);
+
+  int32_t numX, inW;
+  for (numX = 0; numX < strideX; numX++)    /* one pass per copy */
+  {
+    for (inW = 0; inW < inWidth; inW++)
+    {
+      pOutBias[inW + inWidth * numX] = pInBias[inW];    /* copy numX starts at offset inWidth * numX */
+    }
+  }    /* the outWidth % inWidth trailing output elements, if any, are left untouched */
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************/
+/* Description : Tiles the outputScale array across the extended outputScale */
+/*               array, in case of MOD deconvolution using superkernels.     */
+/* Inputs      : outputScale array (U16, width must be > 0)                  */
+/* Outputs     : XI Error Code                                               */
+/* InOuts      : extended outputScale array (outWidth / inWidth copies)      */
+/*****************************************************************************/
+XAI_ERR_TYPE xaiOutScaleExtend_U16_MOD(const xai_pArray outScaleArray,
+                                       xai_pArray extendedOutScaleArray)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_ARRAY_U16(outScaleArray);
+    XAI_CHECK_ARRAY_U16(extendedOutScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outScaleArray) > 0, XAI_ERR_BADARG, "\nOutput scale array width = %d, value must be greater than 0", XAI_ARRAY_GET_WIDTH(outScaleArray));
+  }
+
+  int32_t inWidth  = XAI_ARRAY_GET_WIDTH(outScaleArray);
+  int32_t outWidth = XAI_ARRAY_GET_WIDTH(extendedOutScaleArray);
+  int32_t strideX  = outWidth / inWidth;    /* whole copies that fit; inWidth is a divisor, hence the width check above */
+  uint16_t* pInScale  = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outScaleArray);
+  uint16_t* pOutScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(extendedOutScaleArray);
+
+  int32_t numX, inW;
+  for (numX = 0; numX < strideX; numX++)    /* one pass per copy */
+  {
+    for (inW = 0; inW < inWidth; inW++)
+    {
+      pOutScale[inW + inWidth * numX] = pInScale[inW];    /* copy numX starts at offset inWidth * numX */
+    }
+  }    /* the outWidth % inWidth trailing output elements, if any, are left untouched */
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Implementation for getting the sub-kernel and              */
+/*               super kernel related information.                          */
+/*               If getNumKernelsFlag is passed as 1, function returns the  */
+/*               number of sub-kernels and super kernels.                   */
+/*               If getNumKernelsFlag is passed as 0, function returns the  */
+/*               tile dimension for the sub-kernels and super kernels.      */
+/* Inputs      : Input Coeff Tile, stride along X & Y directions,           */
+/*               getNumKernelsFlag                                          */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Array of Coeff Sub & Super Tiles, numSubKernels and        */
+/*               numSuperKernels                                            */
+/* Assumptions : Coeff is in NDWH format                                    */
+/****************************************************************************/
+XAI_ERR_TYPE xaiDeConvGetDim4D_NDWH(const xai_pTile4D coeffTile,
+                                    xai_pTile4D subCoeffInfo[],
+                                    xai_pTile4D superCoeffInfo[],
+                                    uint16_t *numSubKernels,
+                                    uint16_t *numSuperKernels,
+                                    const uint8_t strideX,
+                                    const uint8_t strideY,
+                                    const uint8_t getNumKernelsFlag)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    if (getNumKernelsFlag)    /* the count outputs are only written in count mode */
+    {
+      XAI_CHECK_POINTER(numSubKernels);
+      XAI_CHECK_POINTER(numSuperKernels);
+    }
+    XAI_CHECK_ERROR((strideX > 0) && (strideY > 0),                                          \
+                    XAI_ERR_BADARG, "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \
+                    strideX, strideY);
+  }
+  if (getNumKernelsFlag)    /* count mode: return before any tile argument is accessed */
+  {
+    *numSubKernels   = strideX * strideY;    /* one sub-kernel per (kIdy, kIdx) pair */
+    *numSuperKernels = strideY;              /* one super kernel per kIdy            */
+    return(XAI_ERROR_STATUS());
+  }
+
+  int32_t kIdx, kIdy;
+  int32_t kernelIdx;
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_TILE4D(coeffTile);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_POINTER(subCoeffInfo);
+    XAI_CHECK_POINTER(superCoeffInfo);
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        XAI_CHECK_POINTER(subCoeffInfo[kernelIdx]);
+      }
+      XAI_CHECK_POINTER(superCoeffInfo[kIdy]);
+    }
+  }
+
+  const int32_t kWidth  = XAI_TILE4D_GET_DIM3(coeffTile);    /* W */
+  const int32_t kHeight = XAI_TILE4D_GET_DIM4(coeffTile);    /* H */
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_ERROR((strideX <= kWidth) && (strideY <= kHeight), XAI_ERR_BADARG,      \
+                    "StrideX = %hhu, value must be less than or equal to %d(kernel Width) \
+            \nStrideY = %hhu, value must be less than or equal to %d(kernel Height)", \
+                    strideX, kWidth, strideY, kHeight);
+  }
+
+  for (kIdy = 0; kIdy < strideY; kIdy++)
+  {
+    for (kIdx = 0; kIdx < strideX; kIdx++)
+    {
+      kernelIdx = kIdy * strideX + kIdx;
+      /* DIM1 and DIM2 are swapped relative to coeffTile; W and H are ceil((k - kId) / stride) */
+      XAI_TILE4D_SET_DIM1(subCoeffInfo[kernelIdx], XAI_TILE4D_GET_DIM2(coeffTile));
+      XAI_TILE4D_SET_DIM2(subCoeffInfo[kernelIdx], XAI_TILE4D_GET_DIM1(coeffTile));
+      XAI_TILE4D_SET_DIM3(subCoeffInfo[kernelIdx], (kWidth + strideX - kIdx - 1) / strideX);
+      XAI_TILE4D_SET_DIM4(subCoeffInfo[kernelIdx], (kHeight + strideY - kIdy - 1) / strideY);
+    }
+    /* The super kernel of row kIdy takes its shape from that row's first sub-kernel, with DIM1 scaled by strideX */
+    XAI_TILE4D_SET_DIM1(superCoeffInfo[kIdy], XAI_TILE4D_GET_DIM1(subCoeffInfo[kIdy * strideX]) * strideX);
+    XAI_TILE4D_SET_DIM2(superCoeffInfo[kIdy], XAI_TILE4D_GET_DIM2(subCoeffInfo[kIdy * strideX]));
+    XAI_TILE4D_SET_DIM3(superCoeffInfo[kIdy], XAI_TILE4D_GET_DIM3(subCoeffInfo[kIdy * strideX]));
+    XAI_TILE4D_SET_DIM4(superCoeffInfo[kIdy], XAI_TILE4D_GET_DIM4(subCoeffInfo[kIdy * strideX]));
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Implementation for getting the sub-kernel                  */
+/*               related information.                                       */
+/*               If getNumKernelsFlag is passed as 1, function returns the  */
+/*               number of sub-kernels.                                     */
+/*               If getNumKernelsFlag is passed as 0, function returns the  */
+/*               tile dimensions for the sub-kernels.                       */
+/* Inputs      : Input Coeff Tile, stride along X & Y directions,           */
+/*               getNumKernelsFlag                                          */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Array of Coeff Sub Tiles and numSubKernels                 */
+/* Assumptions : Coeff is in DWH format                                     */
+/****************************************************************************/
+XAI_ERR_TYPE xaiDeConvGetDim3D_DWH(const xai_pTile3D coeffTile,
+                                   xai_pTile3D subCoeffInfo[],
+                                   uint16_t *numSubKernels,
+                                   const uint8_t strideX,
+                                   const uint8_t strideY,
+                                   const uint8_t getNumKernelsFlag)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    if (getNumKernelsFlag)    /* the count output is only written in count mode */
+    {
+      XAI_CHECK_POINTER(numSubKernels);
+    }
+    XAI_CHECK_ERROR((strideX > 0) && (strideY > 0), XAI_ERR_BADARG,          \
+                    "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \
+                    strideX, strideY);
+  }
+  if (getNumKernelsFlag)    /* count mode: return before any tile argument is accessed */
+  {
+    *numSubKernels = strideX * strideY;    /* one sub-kernel per (kIdy, kIdx) pair */
+    return(XAI_ERROR_STATUS());
+  }
+
+  int32_t kIdx, kIdy;
+  int32_t kernelIdx;
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_TILE3D(coeffTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(coeffTile, XAI_DWH);
+    XAI_CHECK_POINTER(subCoeffInfo);
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        XAI_CHECK_POINTER(subCoeffInfo[kernelIdx]);
+      }
+    }
+  }
+
+  const int32_t kWidth  = XAI_TILE3D_GET_DIM2(coeffTile);    /* W */
+  const int32_t kHeight = XAI_TILE3D_GET_DIM3(coeffTile);    /* H */
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_ERROR((strideX <= kWidth) && (strideY <= kHeight), XAI_ERR_BADARG, \
+                    "StrideX = %hhu, value must be less than or equal to %d(kernel Width) \
+       \nStrideY = %hhu, value must be less than or equal to %d(kernel Height)", \
+                    strideX, kWidth, strideY, kHeight);
+  }
+
+  for (kIdy = 0; kIdy < strideY; kIdy++)
+  {
+    for (kIdx = 0; kIdx < strideX; kIdx++)
+    {
+      kernelIdx = kIdy * strideX + kIdx;
+      /* DIM1 is copied unchanged; W and H are ceil((k - kId) / stride) */
+      XAI_TILE3D_SET_DIM1(subCoeffInfo[kernelIdx], XAI_TILE3D_GET_DIM1(coeffTile));
+      XAI_TILE3D_SET_DIM2(subCoeffInfo[kernelIdx], (kWidth + strideX - kIdx - 1) / strideX);
+      XAI_TILE3D_SET_DIM3(subCoeffInfo[kernelIdx], (kHeight + strideY - kIdy - 1) / strideY);
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Implementation for coefficient reordering                  */
+/*               The function does the following:                           */
+/*               - Convert from NDWH->DNWH                                  */
+/*               - Flips the coefficients across width and height which is  */
+/*                 controlled by transposeCoeffsFlag.                       */
+/*               - Breaks the kernel into sub-kernels.                      */
+/*               - Stacks sub-kernels to form super kernels.                */
+/* Inputs      : Input Coeff Tile, CNN convolution params structure,        */
+/*               transposeCoeffsFlag                                        */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Array of Coeff Sub & Super Tiles                           */
+/* Assumptions : CoeffData is S8/U8                                         */
+/*               Coeff is in NDWH format                                    */
+/****************************************************************************/
+XAI_ERR_TYPE xaiDeConvReOrder4D_I8_NDWH(const xai_pTile4D inTile,
+                                        xai_pTile4D subCoeffs[],
+                                        xai_pTile4D superCoeffs[],
+                                        const xai_cnn_conv_params *param,
+                                        const uint8_t transposeCoeffsFlag)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE4D_I8(inTile);
+    XAI_CHECK_TILE4D_DATA_ORDER(inTile, XAI_NDWH);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_POINTER(subCoeffs);
+    XAI_CHECK_POINTER(superCoeffs);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) >= 1) &&                                                  \
+                     (XAI_CNN_CONV_GET_STRIDEX(param) <= XAI_TILE4D_GET_DIM3(inTile))) &&                       \
+                    ((XAI_CNN_CONV_GET_STRIDEY(param) >= 1) &&                                                  \
+                     (XAI_CNN_CONV_GET_STRIDEY(param) <= XAI_TILE4D_GET_DIM4(inTile))), XAI_ERR_BADARG,         \
+                    "StrideX = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Width) \
+       \nStrideY = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Height)", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_TILE4D_GET_DIM3(inTile),                               \
+                    XAI_CNN_CONV_GET_STRIDEY(param), XAI_TILE4D_GET_DIM4(inTile));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1), \
+                    XAI_ERR_BADARG, "\nDilation is %hhu\nDilation parameter should be equal to 1", XAI_CNN_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                          \
+                    XAI_ERR_BADARG, "\nDilation along width is %hhu and dilation along height is %hhu are not same", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+  }
+
+  int32_t kIdx, kIdy;
+  int32_t kernelIdx;
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    /* Every one of the strideX * strideY sub tiles and strideY super tiles is validated */
+    for (kIdy = 0; kIdy < XAI_CNN_CONV_GET_STRIDEY(param); kIdy++)
+    {
+      for (kIdx = 0; kIdx < XAI_CNN_CONV_GET_STRIDEX(param); kIdx++)
+      {
+        kernelIdx = kIdy * XAI_CNN_CONV_GET_STRIDEX(param) + kIdx;
+        XAI_CHECK_TILE4D_I8(subCoeffs[kernelIdx]);
+        XAI_CHECK_TILE4D_DATA_ORDER(subCoeffs[kernelIdx], XAI_NDWH);
+      }
+      XAI_CHECK_TILE4D_I8(superCoeffs[kIdy]);
+      XAI_CHECK_TILE4D_DATA_ORDER(superCoeffs[kIdy], XAI_NDWH);
+    }
+  }
+
+  int8_t *pInCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(inTile);
+
+  const int32_t numOutCh = XAI_TILE4D_GET_DIM1(inTile); /* N */
+  const int32_t numInCh  = XAI_TILE4D_GET_DIM2(inTile); /* D */
+  const int32_t kWidth   = XAI_TILE4D_GET_DIM3(inTile); /* W */
+  const int32_t kHeight  = XAI_TILE4D_GET_DIM4(inTile); /* H */
+
+  const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param);
+
+  int32_t inCoeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(inTile);
+  int32_t inCoeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(inTile);
+  int32_t inCoeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(inTile);
+
+  int32_t kx, ky, inCh, outCh, inIdx, outIdx = 0;
+  int8_t *pSuperCoeff;
+  int8_t *pSubCoeff;
+  int32_t subKPitch1, subKPitch2, subKPitch3;
+  int32_t superKPitch1, superKPitch2;
+  int32_t kW, kH, subkW;
+  int32_t numInChSubCoeff;
+  int32_t subKIdx;
+
+  int32_t kxStart, kyStart;
+
+  if (transposeCoeffsFlag)
+  {
+    /* Conversion from NDWH -> DNWH,                       */
+    /* transposing of kernels and formation of sub-kernels */
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        pSubCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]);
+
+        outIdx  = 0;
+        kyStart = kHeight - 1 - ((kHeight + strideY - kIdy - 1) % strideY);    /* else-branch start row, mirrored in H */
+
+        for (ky = kyStart; ky >= 0; ky -= strideY)          /* H */
+        {
+          kxStart = kWidth - 1 - ((kWidth + strideX - kIdx - 1) % strideX);    /* else-branch start column, mirrored in W */
+
+          for (kx = kxStart; kx >= 0; kx -= strideX)        /* W */
+          {
+            for (outCh = 0; outCh < numOutCh; outCh++)      /* N */
+            {
+              for (inCh = 0; inCh < numInCh; inCh++)        /* D */
+              {
+                inIdx = ky * inCoeffPitch3 + kx * inCoeffPitch2 + \
+                        inCh * inCoeffPitch1 + outCh;
+                pSubCoeff[outIdx++] = pInCoeff[inIdx];
+              }
+              /* For stride alignment */
+              outIdx += (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH)) ? ((2 * XCHAL_IVPN_SIMD_WIDTH) - (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH))) : 0;
+            }
+          }
+        }
+      }
+    }
+  }
+  else
+  {
+    /* Conversion from NDWH -> DNWH and formation of sub-kernels */
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        pSubCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]);
+
+        outIdx  = 0;
+        kyStart = ((kHeight + strideY - kIdy - 1) % strideY);    /* first kernel row taken by sub-kernel row kIdy */
+
+        for (ky = kyStart; ky < kHeight; ky += strideY)          /* H */
+        {
+          kxStart = ((kWidth + strideX - kIdx - 1) % strideX);   /* first kernel column taken by sub-kernel column kIdx */
+
+          for (kx = kxStart; kx < kWidth; kx += strideX)         /* W */
+          {
+            for (outCh = 0; outCh < numOutCh; outCh++)           /* N */
+            {
+              for (inCh = 0; inCh < numInCh; inCh++)             /* D */
+              {
+                inIdx = ky * inCoeffPitch3 + kx * inCoeffPitch2 + \
+                        inCh * inCoeffPitch1 + outCh;
+                pSubCoeff[outIdx++] = pInCoeff[inIdx];
+              }
+              /* For stride alignment */
+              outIdx += (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH)) ? ((2 * XCHAL_IVPN_SIMD_WIDTH) - (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH))) : 0;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /* Form super-kernels by stacking sub-kernels */
+  for (kernelIdx = 0; kernelIdx < strideY; kernelIdx++)
+  {
+    pSuperCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(superCoeffs[kernelIdx]);
+
+    kW = XAI_TILE4D_GET_DIM3(superCoeffs[kernelIdx]);
+    kH = XAI_TILE4D_GET_DIM4(superCoeffs[kernelIdx]);
+
+    numInChSubCoeff = XAI_TILE4D_GET_DIM1(subCoeffs[kernelIdx * strideX]);
+    superKPitch1    = XAI_TILE4D_GET_DIM1_PITCH(superCoeffs[kernelIdx]);
+    superKPitch2    = XAI_TILE4D_GET_DIM2_PITCH(superCoeffs[kernelIdx]);
+
+    for (subKIdx = 0; subKIdx < strideX; subKIdx++)
+    {
+      pSubCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx * strideX + subKIdx]);
+
+      subkW = XAI_TILE4D_GET_DIM3(subCoeffs[kernelIdx * strideX + subKIdx]);
+
+      subKPitch1 = XAI_TILE4D_GET_DIM1_PITCH(subCoeffs[kernelIdx * strideX + subKIdx]);
+      subKPitch2 = XAI_TILE4D_GET_DIM2_PITCH(subCoeffs[kernelIdx * strideX + subKIdx]);
+      subKPitch3 = XAI_TILE4D_GET_DIM3_PITCH(subCoeffs[kernelIdx * strideX + subKIdx]);
+
+      outIdx = numInChSubCoeff * subKIdx;    /* sub-kernel subKIdx starts at this offset in the super kernel */
+
+      for (ky = 0, kIdy = 0; ky < kH; ky++, kIdy++)          /* H */
+      {
+        for (kx = 0, kIdx = 0; kx < kW; kx++, kIdx++)        /* W */
+        {
+          /*In case of super kernels we have the first sub kernel width/height as the width/height of the superkernel     */
+          /*In case the widths of the subkernel are not equal then we skip by difference and start filling                */
+          /*Once the convolution is done the output junk data appears at the end of the outtile.                          */
+          /*In case of unequal heights this is handled using pointers in test app.                                        */
+          if ((subkW < kW) && (kx == 0))
+          {
+            outIdx += superKPitch2;
+            kIdx--;    /* cancels the loop's kIdx++ so the sub-kernel column index does not advance */
+            continue;
+          }
+          for (outCh = 0; outCh < numOutCh; outCh++)         /* N */
+          {
+            for (inCh = 0; inCh < numInChSubCoeff; inCh++)   /* D */
+            {
+              inIdx = kIdy * subKPitch3 + kIdx * subKPitch2 + \
+                      outCh * subKPitch1 + inCh;
+              pSuperCoeff[outIdx++] = pSubCoeff[inIdx];
+            }
+            outIdx += (superKPitch1 - numInChSubCoeff);    /* net advance per output channel is superKPitch1 */
+          }
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Implementation for coefficient reordering                  */
+/*               The function does the following:                           */
+/*               - Flips the coefficients across width and height which is  */
+/*                 controlled by transposeCoeffsFlag.                       */
+/*               - Breaks the kernel into sub-kernels.                      */
+/* Inputs      : Input Coeff Tile, CNN convolution params structure,        */
+/*               transposeCoeffsFlag                                        */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Array of Coeff Sub  Tiles                                  */
+/* Assumptions : CoeffData is S8/U8                                         */
+/*               Coeff is in DWH format                                     */
+/****************************************************************************/
+XAI_ERR_TYPE xaiDeConvReOrder3D_I8_DWH(const xai_pTile3D inTile,
+                                       xai_pTile3D subCoeffs[],
+                                       const xai_cnn_depthwiseDilatedConv_params *param,
+                                       const uint8_t transposeCoeffsFlag)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_I8(inTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_POINTER(subCoeffs);
+    XAI_CHECK_ERROR(((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param) >= 1) &&                                          \
+                     (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param) <= XAI_TILE3D_GET_DIM2(inTile))) &&               \
+                    ((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param) >= 1) &&                                          \
+                     (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param) <= XAI_TILE3D_GET_DIM3(inTile))), XAI_ERR_BADARG, \
+                    "StrideX = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Width) \
+       \nStrideY = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Height)",           \
+                    XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param), XAI_TILE3D_GET_DIM2(inTile),                       \
+                    XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param), XAI_TILE3D_GET_DIM3(inTile));
+    XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param) == 1),                     \
+                    XAI_ERR_BADARG, "\nDilation is %hhu\nDilation parameter should be equal to 1", \
+                    XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param) == XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param), \
+                    XAI_ERR_BADARG, "\nDilation along width is %hhu and dilation along height is %hhu are not same",            \
+                    XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param));
+  }
+
+  int32_t kIdx, kIdy;
+  int32_t kernelIdx;
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    /* Every one of the strideX * strideY sub tiles is validated */
+    for (kIdy = 0; kIdy < XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param); kIdy++)
+    {
+      for (kIdx = 0; kIdx < XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param); kIdx++)
+      {
+        kernelIdx = kIdy * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param) + kIdx;
+        XAI_CHECK_TILE3D_I8(subCoeffs[kernelIdx]);
+        XAI_CHECK_TILE3D_DATA_ORDER(subCoeffs[kernelIdx], XAI_DWH);
+      }
+    }
+  }
+
+  int8_t *pInCoeff = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+
+  /* Tile dimensions in DWH order */
+  const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile);    /* D */
+  const int32_t kWidth  = XAI_TILE3D_GET_DIM2(inTile);    /* W */
+  const int32_t kHeight = XAI_TILE3D_GET_DIM3(inTile);    /* H */
+
+  const uint8_t strideX = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param);
+  const uint8_t strideY = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param);
+
+  int32_t inCoeffPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  int32_t inCoeffPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  int32_t kx, ky, inCh, inIdx, outIdx = 0;
+  int32_t kxStart, kyStart;
+
+  if (transposeCoeffsFlag)
+  {
+    /* transposing of kernels and formation of sub-kernels */
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        int8_t *pSubCoeff = (int8_t *) XAI_TILE3D_GET_DATA_PTR(subCoeffs[kernelIdx]);
+
+        outIdx  = 0;
+        kyStart = kHeight - 1 - ((kHeight + strideY - kIdy - 1) % strideY);    /* else-branch start row, mirrored in H */
+
+        for (ky = kyStart; ky >= 0; ky -= strideY)          /* H */
+        {
+          kxStart = kWidth - 1 - ((kWidth + strideX - kIdx - 1) % strideX);    /* else-branch start column, mirrored in W */
+
+          for (kx = kxStart; kx >= 0; kx -= strideX)        /* W */
+          {
+            for (inCh = 0; inCh < numInCh; inCh++)          /* D */
+            {
+              inIdx               = ky * inCoeffPitch2 + kx * inCoeffPitch1 + inCh;
+              pSubCoeff[outIdx++] = pInCoeff[inIdx];
+            }
+            /* For stride alignment */
+            outIdx += (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH)) ? ((2 * XCHAL_IVPN_SIMD_WIDTH) - (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH))) : 0;
+          }
+        }
+      }
+    }
+  }
+  else
+  {
+    /* Formation of sub-kernels */
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        int8_t *pSubCoeff = (int8_t *) XAI_TILE3D_GET_DATA_PTR(subCoeffs[kernelIdx]);
+
+        outIdx  = 0;
+        kyStart = ((kHeight + strideY - kIdy - 1) % strideY);    /* first kernel row taken by sub-kernel row kIdy */
+
+        for (ky = kyStart; ky < kHeight; ky += strideY)          /* H */
+        {
+          kxStart = ((kWidth + strideX - kIdx - 1) % strideX);   /* first kernel column taken by sub-kernel column kIdx */
+
+          for (kx = kxStart; kx < kWidth; kx += strideX)         /* W */
+          {
+            for (inCh = 0; inCh < numInCh; inCh++)             /* D */
+            {
+              inIdx               = ky * inCoeffPitch2 + kx * inCoeffPitch1 + inCh;
+              pSubCoeff[outIdx++] = pInCoeff[inIdx];
+            }
+            /* For stride alignment */
+            outIdx += (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH)) ? ((2 * XCHAL_IVPN_SIMD_WIDTH) - (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH))) : 0;
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Vision P6 implementation for interleaving the outputs      */
+/*               generated by convolution functions using the sub-kernels   */
+/* Inputs      : array of output tiles passed as input, CNN convolution     */
+/*               params structure, output tile                              */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : output tile                                                */
+/* Assumptions : Input Tile Data is S8/U8                                   */
+/****************************************************************************/
+XAI_ERR_TYPE xaiDeConvInterleave3D_I8_WHD(const xai_pTile3D inTile[],
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_conv_params *convParams)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(inTile);
+    XAI_CHECK_POINTER(convParams);
+    XAI_CHECK_TILE3D_I8(outTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(outTile);
+  }
+  /* Getting parameters from the tile structures */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(convParams);
+  const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(convParams);
+
+  const int32_t outDataPitch1Offset = (outDataPitch1 * strideY);
+
+  int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t ch, x, y, numX, numY, idx, remX;
+  int8_t *pSubKernelOutput;
+  int8_t *pOutput1;
+  int8_t *pOutput2;
+  int8_t *pInput1;
+  int8_t *pInput2;
+
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_ERROR(((strideX > 0) && (strideY > 0)), XAI_ERR_BADARG,        \
+                    "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \
+                    strideX, strideY);
+
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outTile) >= strideX) &&                      \
+                    (XAI_TILE3D_GET_DIM2(outTile) >= strideY), XAI_ERR_BADARG,        \
+                    "\nOutTile width = %d, value must be greater than or equal to %hhu(strideX) \
+       \nOutTile height = %d,  value must be greater than or equal to %hhu(strideY)", \
+                    XAI_TILE3D_GET_DIM1(outTile), strideX, XAI_TILE3D_GET_DIM2(outTile), strideY);
+
+    for (numY = 0; numY < strideY; numY++)
+    {
+      for (numX = 0; numX < strideX; numX++)
+      {
+        idx = numX + numY * strideX;
+        XAI_CHECK_POINTER(inTile[idx]);
+        XAI_CHECK_TILE3D_I8(inTile[idx]);
+        XAI_CHECK_TILE3D_DATA_ORDER(inTile[idx], XAI_WHD);
+        XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile[idx], outTile);
+        XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inTile[idx]) == XAI_TILE3D_GET_DIM3(outTile), XAI_ERR_BADARG, \
+                        "\nNumber of channels of each subkernel output = %d, final output = %d \
+            \nNumber of channels of each subkernel output and final output should be the same",           \
+                        XAI_TILE3D_GET_DIM3(inTile[idx]), XAI_TILE3D_GET_DIM3(outTile));
+      }
+    }
+  }
+
+  /* Scatter Index Calculations */
+  /* Sequence - 0 1 2 3 4 ... 30 31 */
+  xb_vecNx16U vecSelIdx1 = IVP_SEQNX16U();
+  /* Sequence - 0 strideX 2*strideX 3*strideX 4*strideX .... 30*strideX 31*strideX*/
+  xb_vecNx16U vecScatterOff1 = IVP_MULNX16UPACKL(vecSelIdx1, \
+                                                 (uint16_t) strideX);
+
+  xb_vecNx16U vecScatterOff2;
+  /* Sequence - (32*strideX) (33*strideX) (34*strideX) ....(62*strideX) (63*strideX)*/
+  vecScatterOff2 = IVP_ADDNX16(vecScatterOff1, (XCHAL_IVPN_SIMD_WIDTH * strideX));
+
+  xb_vec2Nx8* restrict pdvecIn1;
+  xb_vec2Nx8 dvecData1;
+  valign vaInData1;
+  vbool2N vecMsk;
+  vboolN vecOffsetMsk1;
+  vboolN vecOffsetMsk2;
+  /* Sequence - 0 1 2 3 4 ... 62 63 */
+  xb_vec2Nx8 vecCmp = IVP_SEQ2NX8U();
+  /* Sequence - 0 1 2 3 4 ... 30 31 */
+  xb_vecNx16U vecOffsetCmp = IVP_SEQNX16U();
+
+  const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH;
+
+  for (numY = 0; numY < strideY; numY++)
+  {
+    for (numX = 0; numX < strideX; numX++)
+    {
+      idx = numX + numY * strideX;
+      int8_t *pInput             = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile[idx]);
+      const int32_t inDataWidth  = XAI_TILE3D_GET_DIM1(inTile[idx]);
+      const int32_t inDataHeight = XAI_TILE3D_GET_DIM2(inTile[idx]);
+      const int32_t inChanNum    = XAI_TILE3D_GET_DIM3(inTile[idx]);
+      const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile[idx]);
+      const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile[idx]);
+      pSubKernelOutput = (pOutput + numX + (numY * outDataPitch1));
+      for (ch = 0; ch < inChanNum; ch++)
+      {
+        pOutput1 = (pSubKernelOutput + (ch * outDataPitch2));
+        pInput1  = (pInput + (ch * inDataPitch2));
+        for (x = 0; x <= (inDataWidth - vectorizationWidth); x += (vectorizationWidth))
+        {
+          pInput2  = (pInput1 + x);
+          pOutput2 = (pOutput1 + (x * strideX));
+          pdvecIn1 = (xb_vec2Nx8 *) pInput2;
+          for (y = 0; y < inDataHeight; y++)
+          {
+            vaInData1 = IVP_LA2NX8_PP(pdvecIn1);
+            IVP_LA2NX8_XP(dvecData1, vaInData1, pdvecIn1, inDataPitch1);
+            IVP_SCATTER2NX8_L(dvecData1, pOutput2, vecScatterOff1);
+            IVP_SCATTER2NX8_H(dvecData1, pOutput2, vecScatterOff2);
+            pOutput2 += outDataPitch1Offset;
+          }
+        }
+        /*To perform Interleaving for inputData widths that are less than the vectorization width*/
+        if (inDataWidth - x)
+        {
+          pInput2  = (pInput1 + x);
+          pOutput2 = ((pOutput1 + (x * strideX)));
+          pdvecIn1 = (xb_vec2Nx8 *) pInput2;
+          remX     = (inDataWidth - x);
+          /*Creating Mask to scatter only the availble valid inputs that should be interleaved*/
+          vecMsk = IVP_LT2NX8(vecCmp, remX);
+          /*Creating Mask for the scatter operation to have only valid offsets based on the available inputs*/
+          vecOffsetMsk1 = IVP_LTNX16(vecOffsetCmp, remX);
+          vecOffsetMsk2 = IVP_LTNX16(vecOffsetCmp, (remX - XCHAL_IVPN_SIMD_WIDTH));
+          for (y = 0; y < inDataHeight; y++)
+          {
+            vaInData1 = IVP_LA2NX8_PP(pdvecIn1);
+            IVP_LA2NX8_XP(dvecData1, vaInData1, pdvecIn1, inDataPitch1);
+            IVP_SCATTER2NX8T_L(dvecData1, pOutput2, IVP_MOVNX16T(vecScatterOff1, 0, vecOffsetMsk1), (vecMsk));
+            IVP_SCATTER2NX8T_H(dvecData1, pOutput2, IVP_MOVNX16T(vecScatterOff2, 0, vecOffsetMsk2), (vecMsk));
+            pOutput2 += outDataPitch1Offset;
+          }
+        }
+      }
+    }
+  }
+
+  IVP_SCATTERW();  /* Adding Memory Wait until all the scatter and store operations are completed */
+
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Interleave the strideX*strideY sub-kernel output tiles     */
+/*               produced by sub-kernel convolutions into one output tile   */
+/* Inputs      : inTile[]: sub-kernel tiles, index numX + numY*strideX      */
+/*               convParams: depthwise conv params supplying the strides    */
+/* Outputs     : XAI error code                                             */
+/* InOuts      : outTile: receives the interleaved data                     */
+/* Assumptions : All tiles hold S8/U8 data in WHD order                     */
+/****************************************************************************/
+XAI_ERR_TYPE xaiDepthwiseDeConvInterleave3D_I8_WHD(const xai_pTile3D inTile[],
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_depthwiseDilatedConv_params *convParams)
+{
+  /* Error checks on the argument pointers and on the output tile */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(inTile);
+    XAI_CHECK_POINTER(convParams);
+    XAI_CHECK_TILE3D_I8(outTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(outTile);
+  }
+  /* Getting parameters from the tile structures */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  const uint8_t strideX = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(convParams);
+  const uint8_t strideY = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(convParams);
+
+  const int32_t outDataPitch1Offset = (outDataPitch1 * strideY); /* output step between two consecutive input rows */
+
+  int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t ch, x, y, numX, numY, idx, remX;
+  int8_t *pSubKernelOutput; /* first output element written by the current sub-kernel tile */
+  int8_t *pOutput1;         /* pSubKernelOutput moved to the current channel */
+  int8_t *pOutput2;         /* output cursor for the current column group / row */
+  int8_t *pInput1;          /* current channel of the current input tile */
+  int8_t *pInput2;          /* current column group inside that channel */
+  /* Stride checks, then per-tile checks for each of the strideX*strideY input tiles */
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_ERROR(((strideX > 0) && (strideY > 0)),
+                    XAI_ERR_BADARG, "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \
+                    strideX, strideY);
+
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outTile) >= strideX) &&                      \
+                    (XAI_TILE3D_GET_DIM2(outTile) >= strideY), XAI_ERR_BADARG,        \
+                    "\nOutTile width = %d, value must be greater than or equal to %hhu(strideX) \
+       \nOutTile height = %d,  value must be greater than or equal to %hhu(strideY)", \
+                    XAI_TILE3D_GET_DIM1(outTile), strideX, XAI_TILE3D_GET_DIM2(outTile), strideY);
+
+    for (numY = 0; numY < strideY; numY++)
+    {
+      for (numX = 0; numX < strideX; numX++)
+      {
+        idx = numX + numY * strideX;
+        XAI_CHECK_POINTER(inTile[idx]);
+        XAI_CHECK_TILE3D_I8(inTile[idx]);
+        XAI_CHECK_TILE3D_DATA_ORDER(inTile[idx], XAI_WHD);
+        XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile[idx], outTile);
+        XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inTile[idx]) == XAI_TILE3D_GET_DIM3(outTile), XAI_ERR_BADARG, \
+                        "\nNumber of channels of each subkernel output = %d, final output = %d \
+           \nNumber of channels of each subkernel output and final output should be the same",            \
+                        XAI_TILE3D_GET_DIM3(inTile[idx]), XAI_TILE3D_GET_DIM3(outTile));
+      }
+    }
+  }
+
+  /* Scatter offset calculations: consecutive input columns land strideX bytes apart in the output */
+  /* Sequence - 0 1 2 3 4 ... 30 31 */
+  xb_vecNx16U vecSelIdx1 = IVP_SEQNX16U();
+  /* Offsets for the lower half of the lanes - 0 strideX 2*strideX 3*strideX .... 30*strideX 31*strideX */
+  xb_vecNx16U vecScatterOff1 = IVP_MULNX16UPACKL(vecSelIdx1, \
+                                                 (uint16_t) strideX);
+
+  xb_vecNx16U vecScatterOff2;
+  /* Offsets for the upper half of the lanes - (32*strideX) (33*strideX) ....(62*strideX) (63*strideX) */
+  vecScatterOff2 = IVP_ADDNX16(vecScatterOff1, (XCHAL_IVPN_SIMD_WIDTH * strideX));
+
+  xb_vec2Nx8* restrict pdvecIn1;
+  xb_vec2Nx8 dvecData1;
+  valign vaInData1;
+  vbool2N vecMsk;       /* data lanes that hold valid input in the tail vector */
+  vboolN vecOffsetMsk1; /* lower-half lanes whose scatter offset is kept in the tail */
+  vboolN vecOffsetMsk2; /* upper-half lanes whose scatter offset is kept in the tail */
+  /* Sequence - 0 1 2 3 4 ... 62 63 */
+  xb_vec2Nx8 vecCmp = IVP_SEQ2NX8U();
+  /* Sequence - 0 1 2 3 4 ... 30 31 */
+  xb_vecNx16U vecOffsetCmp = IVP_SEQNX16U();
+
+  const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH; /* input columns handled per vector */
+
+  for (numY = 0; numY < strideY; numY++)
+  {
+    for (numX = 0; numX < strideX; numX++)
+    {
+      idx = numX + numY * strideX; /* sub-kernel tiles are indexed row-major over (numY, numX) */
+      int8_t *pInput             = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile[idx]);
+      const int32_t inDataWidth  = XAI_TILE3D_GET_DIM1(inTile[idx]);
+      const int32_t inDataHeight = XAI_TILE3D_GET_DIM2(inTile[idx]);
+      const int32_t inChanNum    = XAI_TILE3D_GET_DIM3(inTile[idx]);
+      const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile[idx]);
+      const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile[idx]);
+      pSubKernelOutput = (pOutput + numX + (numY * outDataPitch1)); /* tile (numX, numY) starts at output column numX, row numY */
+      for (ch = 0; ch < inChanNum; ch++)
+      {
+        pOutput1 = (pSubKernelOutput + (ch * outDataPitch2));
+        pInput1  = (pInput + (ch * inDataPitch2));
+        for (x = 0; x <= (inDataWidth - vectorizationWidth); x += (vectorizationWidth)) /* full vectors of input columns */
+        {
+          pInput2  = (pInput1 + x);
+          pOutput2 = (pOutput1 + (x * strideX)); /* input column x maps to output column numX + x*strideX */
+          pdvecIn1 = (xb_vec2Nx8 *) pInput2;
+          for (y = 0; y < inDataHeight; y++)
+          {
+            vaInData1 = IVP_LA2NX8_PP(pdvecIn1); /* prime the alignment register for this row */
+            IVP_LA2NX8_XP(dvecData1, vaInData1, pdvecIn1, inDataPitch1); /* load one vector, then step to the next input row */
+            IVP_SCATTER2NX8_L(dvecData1, pOutput2, vecScatterOff1); /* lower half of the lanes */
+            IVP_SCATTER2NX8_H(dvecData1, pOutput2, vecScatterOff2); /* upper half of the lanes */
+            pOutput2 += outDataPitch1Offset; /* input row y maps to output row numY + y*strideY */
+          }
+        }
+        /* Tail: interleave the remaining input columns (fewer than one full vector) */
+        if (inDataWidth - x)
+        {
+          pInput2  = (pInput1 + x);
+          pOutput2 = ((pOutput1 + (x * strideX)));
+          pdvecIn1 = (xb_vec2Nx8 *) pInput2;
+          remX     = (inDataWidth - x); /* number of valid columns left */
+          /* Mask so that only the available valid inputs are scattered */
+          vecMsk = IVP_LT2NX8(vecCmp, remX);
+          /* Masks selecting the scatter offsets that belong to valid inputs, per half of the lanes */
+          vecOffsetMsk1 = IVP_LTNX16(vecOffsetCmp, remX);
+          vecOffsetMsk2 = IVP_LTNX16(vecOffsetCmp, (remX - XCHAL_IVPN_SIMD_WIDTH));
+          for (y = 0; y < inDataHeight; y++)
+          {
+            vaInData1 = IVP_LA2NX8_PP(pdvecIn1);
+            IVP_LA2NX8_XP(dvecData1, vaInData1, pdvecIn1, inDataPitch1);
+            IVP_SCATTER2NX8T_L(dvecData1, pOutput2, IVP_MOVNX16T(vecScatterOff1, 0, vecOffsetMsk1), (vecMsk)); /* offsets outside the mask are replaced by 0 */
+            IVP_SCATTER2NX8T_H(dvecData1, pOutput2, IVP_MOVNX16T(vecScatterOff2, 0, vecOffsetMsk2), (vecMsk));
+            pOutput2 += outDataPitch1Offset;
+          }
+        }
+      }
+    }
+  }
+
+  IVP_SCATTERW();  /* Memory wait until all the scatter and store operations are completed */
+
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Interleave the strideX*strideY sub-kernel output tiles     */
+/*               produced by sub-kernel convolutions into one output tile   */
+/* Inputs      : inTile[]: sub-kernel tiles, index numX + numY*strideX      */
+/*               convParams: CNN conv params supplying the strides          */
+/* Outputs     : XAI error code                                             */
+/* InOuts      : outTile: receives the interleaved data                     */
+/* Assumptions : All tiles hold 16-bit (I16) data in WHD order              */
+/****************************************************************************/
+
+XAI_ERR_TYPE xaiDeConvInterleave3D_I16_WHD(const xai_pTile3D inTile[],
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_conv_params *convParams)
+{
+  /* Error checks on the argument pointers and on the output tile */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(inTile);
+    XAI_CHECK_POINTER(convParams);
+    XAI_CHECK_TILE3D_I16(outTile);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(outTile);
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  const uint8_t strideX             = XAI_CNN_CONV_GET_STRIDEX(convParams);
+  const uint8_t strideY             = XAI_CNN_CONV_GET_STRIDEY(convParams);
+  const int32_t outDataPitch1Offset = (outDataPitch1 * strideY); /* output step (in elements) between two consecutive input rows */
+
+  int16_t *pOutput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t ch, x, y, numX, numY, idx, remX;
+  int16_t *pSubKernelOutput; /* first output element written by the current sub-kernel tile */
+  int16_t *pOutput1;         /* pSubKernelOutput moved to the current channel */
+  int16_t *pOutput2;         /* output cursor for the current column group / row */
+  int16_t *pInput1;          /* current channel of the current input tile */
+  int16_t *pInput2;          /* current column group inside that channel */
+
+  /* Stride checks, then per-tile checks for each of the strideX*strideY input tiles */
+  XAI_ERROR_CHECKS_CONTINUE()
+  {
+    XAI_CHECK_ERROR(((strideX > 0) && (strideY > 0)),
+                    XAI_ERR_BADARG, "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \
+                    strideX, strideY);
+
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outTile) >= strideX) &&                      \
+                    (XAI_TILE3D_GET_DIM2(outTile) >= strideY), XAI_ERR_BADARG,        \
+                    "\nOutTile width = %d, value must be greater than or equal to %hhu(strideX) \
+       \nOutTile height = %d,  value must be greater than or equal to %hhu(strideY)", \
+                    XAI_TILE3D_GET_DIM1(outTile), strideX, XAI_TILE3D_GET_DIM2(outTile), strideY);
+
+    for (numY = 0; numY < strideY; numY++)
+    {
+      for (numX = 0; numX < strideX; numX++)
+      {
+        idx = numX + numY * strideX;
+        XAI_CHECK_POINTER(inTile[idx]);
+        XAI_CHECK_TILE3D_I16(inTile[idx]);
+        XAI_CHECK_TILE3D_DATA_ORDER(inTile[idx], XAI_WHD);
+        XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile[idx], outTile);
+        XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inTile[idx]) == XAI_TILE3D_GET_DIM3(outTile), XAI_ERR_BADARG, \
+                        "\nNumber of channels of each subkernel output = %d, final output = %d \
+           \nNumber of channels of each subkernel output and final output should be the same",            \
+                        XAI_TILE3D_GET_DIM3(inTile[idx]), XAI_TILE3D_GET_DIM3(outTile));
+      }
+    }
+  }
+
+  /* Scatter offset calculations */
+  /* Sequence - 0 1 2 3 4 ... 30 31 */
+  xb_vecNx16U vecSelIdx1 = IVP_SEQNX16U();
+  /* Sequence - 0 2*strideX 4*strideX 6*strideX .... 62*strideX (lane index * strideX * 2, i.e. 2 bytes per 16-bit element) */
+  xb_vecNx16U vecScatterOff1 = IVP_MULNX16UPACKL(vecSelIdx1, \
+                                                 (uint16_t) strideX * 2);
+
+  xb_vecNx16* restrict pdvecIn1;
+  xb_vecNx16 dvecData1;
+  valign vaInData1;
+  vboolN vecMsk;        /* data lanes that hold valid input in the tail vector */
+  vboolN vecOffsetMsk1; /* lanes whose scatter offset is kept in the tail */
+  /* Sequence - 0 1 2 3 4 ... 30 31 */
+  xb_vecNx16U vecCmp       = IVP_SEQNX16U();
+  xb_vecNx16U vecOffsetCmp = IVP_SEQNX16U();
+
+
+  const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; /* input columns handled per vector */
+
+  for (numY = 0; numY < strideY; numY++)
+  {
+    for (numX = 0; numX < strideX; numX++)
+    {
+      idx = numX + numY * strideX; /* sub-kernel tiles are indexed row-major over (numY, numX) */
+      int16_t *pInput            = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile[idx]);
+      const int32_t inDataWidth  = XAI_TILE3D_GET_DIM1(inTile[idx]);
+      const int32_t inDataHeight = XAI_TILE3D_GET_DIM2(inTile[idx]);
+      const int32_t inChanNum    = XAI_TILE3D_GET_DIM3(inTile[idx]);
+      const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile[idx]);
+      const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile[idx]);
+      pSubKernelOutput = (pOutput + numX + (numY * outDataPitch1)); /* tile (numX, numY) starts at output column numX, row numY */
+      for (ch = 0; ch < inChanNum; ch++)
+      {
+        pOutput1 = (pSubKernelOutput + (ch * outDataPitch2));
+        pInput1  = (pInput + (ch * inDataPitch2));
+        for (x = 0; x <= (inDataWidth - vectorizationWidth); x += (vectorizationWidth)) /* full vectors of input columns */
+        {
+          pInput2  = (pInput1 + x);
+          pOutput2 = (pOutput1 + (x * strideX)); /* input column x maps to output column numX + x*strideX */
+          pdvecIn1 = (xb_vecNx16 *) pInput2;
+          for (y = 0; y < inDataHeight; y++)
+          {
+            vaInData1 = IVP_LANX16_PP(pdvecIn1); /* prime the alignment register for this row */
+            IVP_LANX16_XP(dvecData1, vaInData1, pdvecIn1, (inDataPitch1 << 1)); /* pitch is in elements; << 1 gives the byte step to the next row */
+            IVP_SCATTERNX16(dvecData1, pOutput2, vecScatterOff1);
+            pOutput2 += outDataPitch1Offset; /* input row y maps to output row numY + y*strideY */
+          }
+        }
+        /* Tail: interleave the remaining input columns (fewer than one full vector) */
+        if (inDataWidth - x)
+        {
+          pInput2  = (pInput1 + x);
+          pOutput2 = ((pOutput1 + (x * strideX)));
+          pdvecIn1 = (xb_vecNx16 *) (pInput2);
+          remX     = (inDataWidth - x); /* number of valid columns left */
+          /* Mask so that only the available valid inputs are scattered */
+          vecMsk = IVP_LTNX16(vecCmp, remX);
+          /* Mask selecting the scatter offsets that belong to valid inputs */
+          vecOffsetMsk1 = IVP_LTNX16(vecOffsetCmp, remX);
+          for (y = 0; y < inDataHeight; y++)
+          {
+            vaInData1 = IVP_LANX16_PP(pdvecIn1);
+            IVP_LANX16_XP(dvecData1, vaInData1, pdvecIn1, (inDataPitch1 << 1));
+            IVP_SCATTERNX16T(dvecData1, pOutput2, IVP_MOVNX16T(vecScatterOff1, 0, vecOffsetMsk1), (vecMsk)); /* offsets outside the mask are replaced by 0 */
+            pOutput2 += outDataPitch1Offset;
+          }
+        }
+      }
+    }
+  }
+
+  IVP_SCATTERW();  /* Memory wait until all the scatter and store operations are completed */
+
+  return(XAI_ERROR_STATUS());
+}
+
+/**********************xaiConvolvedBiasUpdate_S8S32*************************/
+/* Description : Bias fixup for Convolved3D_MOD functions that take a U8  */
+/*               input tile and convert it to S8: adds 128 * (sum of the  */
+/*               S8 coefficients of an output channel) to that channel's  */
+/*               bias. Returns XAI_ERR_OVERFLOW if an updated bias falls  */
+/*               outside [S24_MIN, S24_MAX].                              */
+/* Inputs      : Coeff Tile                                               */
+/* InOuts      : biasArray                                                */
+/* Assumptions : coeffData is S8 and biasData is S32                      */
+/*               Coefficient tile is in NDWH format                       */
+/**************************************************************************/
+XAI_ERR_TYPE xaiConvolvedBiasUpdate_S8S32(const xai_pTile4D coeffTile,
+                                          xai_pArray biasArray
+                                          )
+{
+  /* Error checks: tile/array types, NDWH order, bias array at least as wide as the kernel count */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffTile) <= XAI_ARRAY_GET_WIDTH(biasArray)), XAI_ERR_BADARG,                                        \
+                    "\nNumber of Kernels = %d, Width of Bias Array = %d\nNumber of Kernels must be less than or equal to Width of Bias Array", \
+                    XAI_TILE4D_GET_DIM1(coeffTile), XAI_ARRAY_GET_WIDTH(biasArray));
+  }
+#ifndef IVP_MULSUQA2N8XR8 /* the bias is left untouched when this macro is defined */
+  /* Data pointers of the coefficient and bias data */
+  int8_t *pCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBias = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+
+  /* Vector Pointers */
+  xb_vec2Nx8*    restrict pdvecCoeff;
+  xb_vecN_2x32v* restrict phvecBias = (xb_vecN_2x32v *) (pBias);
+  xb_vecN_2x32v* phvecBiasIn        = phvecBias; /* read cursor: the bias is updated in place */
+  xb_vecN_2x32v* phvecBiasOut       = phvecBias; /* write cursor, starts at the same address */
+  valign vaInBias                   = IVP_LAN_2X32_PP(phvecBiasIn);
+  valign vaOutBias                  = IVP_ZALIGN();
+
+  /* Getting parameters from the tile structures */
+  const int32_t outChanNum      = XAI_TILE4D_GET_DIM1(coeffTile);
+  const int32_t inChanNum       = XAI_TILE4D_GET_DIM2(coeffTile);
+  const uint16_t kWidthU        = XAI_TILE4D_GET_DIM3(coeffTile);
+  const uint16_t kHeightU       = XAI_TILE4D_GET_DIM4(coeffTile);
+  const int32_t coeffDataPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffDataPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffDataPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+  int32_t accOverflowFlag       = 0; /* number of bias lanes found outside [S24_MIN, S24_MAX] */
+
+  int32_t outCh, kx, ky, inCh;
+  /*
+     IF inputdata is S8
+     convolutionS8 = summation(InputData * CoeffData)
+     IF inputdata is U8
+     convolutionU8 = summation((InputData - 128) * CoeffData) + summation(128 * CoeffData)
+                = convolutionS8 + summation(128 * CoeffData)
+                = convolutionS8 + 128 * summation( CoeffData)
+                  128 * summation( CoeffData) is performed below
+   */
+
+  const int32_t vectorizationWidth = (XCHAL_IVPN_SIMD_WIDTH << 1); /* output channels handled per iteration */
+
+  /* Iterate Over OutChannels */
+  for (outCh = 0; outCh < outChanNum; outCh += vectorizationWidth)
+  {
+    /* Calculate remaining output channels */
+    int32_t remOutCh = (outChanNum - outCh);
+
+    /* Initialize Accumulator Vector */
+    xb_vec2Nx24 daccSum = IVP_ZERO2NX24();
+
+    /* Computes the sum of coeffs corresponding to the same outChannel */
+    for (ky = 0; ky < kHeightU; ky++)
+    {
+      for (kx = 0; kx < kWidthU; kx++)
+      {
+        int32_t coeffIdx = outCh + kx * coeffDataPitch2 + ky * coeffDataPitch3;
+        pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffIdx);
+
+        for (inCh = 0; inCh < inChanNum - 3; inCh += 4) /* four input channels per iteration */
+        {
+          xb_vec2Nx8 dvecCoeff1, dvecCoeff2, dvecCoeff3, dvecCoeff4;
+
+          IVP_L2U2NX8_XP(dvecCoeff1, pdvecCoeff, coeffDataPitch1);
+          IVP_L2U2NX8_XP(dvecCoeff2, pdvecCoeff, coeffDataPitch1);
+          IVP_L2U2NX8_XP(dvecCoeff3, pdvecCoeff, coeffDataPitch1);
+          IVP_L2U2NX8_XP(dvecCoeff4, pdvecCoeff, coeffDataPitch1);
+
+          IVP_ADDWA2NX8(daccSum, dvecCoeff2, dvecCoeff1);
+          IVP_ADDWA2NX8(daccSum, dvecCoeff4, dvecCoeff3);
+        }
+        for (; inCh < inChanNum - 1; inCh += 2) /* then two input channels per iteration */
+        {
+          xb_vec2Nx8 dvecCoeff1, dvecCoeff2;
+
+          IVP_L2U2NX8_XP(dvecCoeff1, pdvecCoeff, coeffDataPitch1);
+          IVP_L2U2NX8_XP(dvecCoeff2, pdvecCoeff, coeffDataPitch1);
+
+          IVP_ADDWA2NX8(daccSum, dvecCoeff2, dvecCoeff1);
+        }
+        if (inCh < inChanNum) /* last input channel when the count is odd */
+        {
+          xb_vec2Nx8 dvecCoeff;
+
+          IVP_L2U2NX8_XP(dvecCoeff, pdvecCoeff, coeffDataPitch1);
+
+          IVP_ADDWA2NX8(daccSum, (xb_vec2Nx8) 0, dvecCoeff); /* second operand is a zero vector */
+        }
+      }
+    }
+
+    /* Add Adjustment for Bias to Bias Vectors */
+    xb_vecN_2x32v hvecBiasLL, hvecBiasLH, hvecBiasHL, hvecBiasHH;
+    int32_t remBiasBytes = remOutCh * 4; /* 4 bytes per S32 bias value */
+
+    /* Number of channels processed by N_2-way 32-bit vector */
+    const int32_t numProcessCh = XCHAL_IVPN_SIMD_WIDTH >> 1;
+
+    /* Convert the accumulated sums to 4 half vectors and shift left by 7 (multiply by 128) */
+    xb_vecN_2x32v hvecAccLL, hvecAccLH, hvecAccHL, hvecAccHH;
+    hvecAccLL = IVP_CVT32S2NX24LL(daccSum); hvecAccLL = IVP_SLAN_2X32(hvecAccLL, 7);
+    hvecAccLH = IVP_CVT32S2NX24LH(daccSum); hvecAccLH = IVP_SLAN_2X32(hvecAccLH, 7);
+    hvecAccHL = IVP_CVT32S2NX24HL(daccSum); hvecAccHL = IVP_SLAN_2X32(hvecAccHL, 7);
+    hvecAccHH = IVP_CVT32S2NX24HH(daccSum); hvecAccHH = IVP_SLAN_2X32(hvecAccHH, 7);
+    /* Zero the fixup in lanes beyond the remaining output channels */
+    hvecAccLL = IVP_MOVN_2X32T(hvecAccLL, (xb_vecN_2x32v) 0, \
+                               IVP_LTN_2X32(IVP_SEQN_2X32(), (xb_vecN_2x32v) (remOutCh)));
+    hvecAccLH = IVP_MOVN_2X32T(hvecAccLH, (xb_vecN_2x32v) 0, \
+                               IVP_LTN_2X32(IVP_SEQN_2X32(), (xb_vecN_2x32v) (remOutCh - (numProcessCh))));
+    hvecAccHL = IVP_MOVN_2X32T(hvecAccHL, (xb_vecN_2x32v) 0, \
+                               IVP_LTN_2X32(IVP_SEQN_2X32(), (xb_vecN_2x32v) (remOutCh - (2 * numProcessCh))));
+    hvecAccHH = IVP_MOVN_2X32T(hvecAccHH, (xb_vecN_2x32v) 0, \
+                               IVP_LTN_2X32(IVP_SEQN_2X32(), (xb_vecN_2x32v) (remOutCh - (3 * numProcessCh))));
+    /* Load the bias values; the byte count passed to each load shrinks by one vector (2 * XCHAL_IVPN_SIMD_WIDTH bytes) */
+    IVP_LAVN_2X32_XP(hvecBiasLL, vaInBias, phvecBiasIn, remBiasBytes);
+    IVP_LAVN_2X32_XP(hvecBiasLH, vaInBias, phvecBiasIn, remBiasBytes - (2 * XCHAL_IVPN_SIMD_WIDTH));
+    IVP_LAVN_2X32_XP(hvecBiasHL, vaInBias, phvecBiasIn, remBiasBytes - (4 * XCHAL_IVPN_SIMD_WIDTH));
+    IVP_LAVN_2X32_XP(hvecBiasHH, vaInBias, phvecBiasIn, remBiasBytes - (6 * XCHAL_IVPN_SIMD_WIDTH));
+
+    /* Add Bias and its Adjustment */
+    hvecBiasLL = IVP_ADDN_2X32(hvecBiasLL, hvecAccLL);
+    hvecBiasLH = IVP_ADDN_2X32(hvecBiasLH, hvecAccLH);
+    hvecBiasHL = IVP_ADDN_2X32(hvecBiasHL, hvecAccHL);
+    hvecBiasHH = IVP_ADDN_2X32(hvecBiasHH, hvecAccHH);
+
+    /* Count lanes outside [S24_MIN, S24_MAX]; only those lanes are shifted by 8 and then by -8 */
+    vboolN_2 hvbOverflow;
+
+    /* hvecBiasLL - NOTE(review): the +8 / -8 shift pair presumably wraps the value to 24 bits; confirm */
+    hvbOverflow      = IVP_ORBN_2(IVP_LTN_2X32(hvecBiasLL, S24_MIN), IVP_LTN_2X32(S24_MAX, hvecBiasLL));
+    accOverflowFlag += (int32_t) IVP_RADDN_2X32T((xb_vecN_2x32v) 1, hvbOverflow);
+    hvecBiasLL       = IVP_SLAN_2X32(hvecBiasLL, IVP_MOVN_2X32T((xb_vecN_2x32v) (8), (xb_vecN_2x32v) 0, hvbOverflow));
+    hvecBiasLL       = IVP_SLAN_2X32(hvecBiasLL, IVP_MOVN_2X32T((xb_vecN_2x32v) (-8), (xb_vecN_2x32v) 0, hvbOverflow));
+
+    /* hvecBiasLH */
+    hvbOverflow      = IVP_ORBN_2(IVP_LTN_2X32(hvecBiasLH, S24_MIN), IVP_LTN_2X32(S24_MAX, hvecBiasLH));
+    accOverflowFlag += (int32_t) IVP_RADDN_2X32T((xb_vecN_2x32v) 1, hvbOverflow);
+    hvecBiasLH       = IVP_SLAN_2X32(hvecBiasLH, IVP_MOVN_2X32T((xb_vecN_2x32v) (8), (xb_vecN_2x32v) 0, hvbOverflow));
+    hvecBiasLH       = IVP_SLAN_2X32(hvecBiasLH, IVP_MOVN_2X32T((xb_vecN_2x32v) (-8), (xb_vecN_2x32v) 0, hvbOverflow));
+
+    /* hvecBiasHL */
+    hvbOverflow      = IVP_ORBN_2(IVP_LTN_2X32(hvecBiasHL, S24_MIN), IVP_LTN_2X32(S24_MAX, hvecBiasHL));
+    accOverflowFlag += (int32_t) IVP_RADDN_2X32T((xb_vecN_2x32v) 1, hvbOverflow);
+    hvecBiasHL       = IVP_SLAN_2X32(hvecBiasHL, IVP_MOVN_2X32T((xb_vecN_2x32v) (8), (xb_vecN_2x32v) 0, hvbOverflow));
+    hvecBiasHL       = IVP_SLAN_2X32(hvecBiasHL, IVP_MOVN_2X32T((xb_vecN_2x32v) (-8), (xb_vecN_2x32v) 0, hvbOverflow));
+
+    /* hvecBiasHH */
+    hvbOverflow      = IVP_ORBN_2(IVP_LTN_2X32(hvecBiasHH, S24_MIN), IVP_LTN_2X32(S24_MAX, hvecBiasHH));
+    accOverflowFlag += (int32_t) IVP_RADDN_2X32T((xb_vecN_2x32v) 1, hvbOverflow);
+    hvecBiasHH       = IVP_SLAN_2X32(hvecBiasHH, IVP_MOVN_2X32T((xb_vecN_2x32v) (8), (xb_vecN_2x32v) 0, hvbOverflow));
+    hvecBiasHH       = IVP_SLAN_2X32(hvecBiasHH, IVP_MOVN_2X32T((xb_vecN_2x32v) (-8), (xb_vecN_2x32v) 0, hvbOverflow));
+
+    /* Store the updated bias back, using the same byte counts as the loads */
+    IVP_SAVN_2X32_XP(hvecBiasLL, vaOutBias, phvecBiasOut, remBiasBytes);
+    IVP_SAVN_2X32_XP(hvecBiasLH, vaOutBias, phvecBiasOut, remBiasBytes - (2 * XCHAL_IVPN_SIMD_WIDTH));
+    IVP_SAVN_2X32_XP(hvecBiasHL, vaOutBias, phvecBiasOut, remBiasBytes - (4 * XCHAL_IVPN_SIMD_WIDTH));
+    IVP_SAVN_2X32_XP(hvecBiasHH, vaOutBias, phvecBiasOut, remBiasBytes - (6 * XCHAL_IVPN_SIMD_WIDTH));
+  }
+
+  IVP_SAPOSN_2X32_FP(vaOutBias, phvecBiasOut); /* flush what is still held in the store alignment register */
+
+  if (accOverflowFlag) /* at least one updated bias was outside [S24_MIN, S24_MAX] */
+  {
+    return(XAI_ERR_OVERFLOW);
+  }
+#endif /* IVP_MULSUQA2N8XR8 */
+  return(XAI_ERROR_STATUS());
+}
+
+/************************  xaiReOrder4DToIN32DWH_I16  ***********************/
+/* Description : Scalar C routine that repacks a coefficient tile given in */
+/*               WHDN, DWHN or NDWH order into the IN32DWH layout          */
+/* Inputs      : coeffTileIn  - coefficients in WHDN, DWHN or NDWH order   */
+/* Outputs     : coeffTileOut - coefficients in IN32DWH order              */
+/* Assumptions : The width and height of the coefficient tile are 1        */
+/*               Both tiles hold 16-bit data (S16 / U16)                   */
+/***************************************************************************/
+XAI_ERR_TYPE xaiReOrder4DToIN32DWH_I16(xai_pTile4D coeffTileIn, xai_pTile4D coeffTileOut)
+{
+  /* Validate tile types, data orders, dimensions and reject in-place use */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE4D_I16(coeffTileIn);
+    XAI_CHECK_TILE4D_I16(coeffTileOut);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_WHDN) ||                                                       \
+                    (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_DWHN) || (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_NDWH), \
+                    XAI_ERR_BADARG, "The Data Order of the input  is not supported by this function");
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTileOut, XAI_IN32DWH);
+    XAI_CHECK_DIM_IN32DWH(coeffTileIn, coeffTileOut);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DATA_PTR(coeffTileIn) != XAI_ARRAY_GET_DATA_PTR(coeffTileOut)), XAI_ERR_INPLACE, "The input and output tile pointers overlap");
+  }
+
+  /* Channel counts and element strides of the input, selected by its data order */
+  int32_t inChannels, outChannels, outChStride, inChStride;
+  const int32_t isWHDN = (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_WHDN);
+  const int32_t isDWHN = (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_DWHN);
+
+  if (isWHDN || isDWHN)
+  {
+    /* Both orders keep the output channels in dim 4; only the input-channel dim differs */
+    inChannels  = isWHDN ? XAI_TILE4D_GET_DIM3(coeffTileIn) : XAI_TILE4D_GET_DIM1(coeffTileIn);
+    outChannels = XAI_TILE4D_GET_DIM4(coeffTileIn);
+    outChStride = XAI_TILE4D_GET_DIM3_PITCH(coeffTileIn);
+    inChStride  = 1;
+  }
+  else /* every other order is treated as NDWH */
+  {
+    inChannels  = XAI_TILE4D_GET_DIM2(coeffTileIn);
+    outChannels = XAI_TILE4D_GET_DIM1(coeffTileIn);
+    outChStride = 1;
+    inChStride  = XAI_TILE4D_GET_DIM1_PITCH(coeffTileIn);
+  }
+
+  /* Element pointers to the source and destination coefficient data */
+  const int16_t *pSrcBase = (const int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTileIn);
+  int16_t *pDstBase       = (int16_t *) XAI_ARRAY_GET_DATA_PTR(coeffTileOut);
+  int32_t outBase, inCh, lane;
+
+  /*
+     IN32DWH layout written below (dI_O = input channel I, output channel O):
+     d0_0,....d0_31, d1_0,...d1_31, ....dN_0,...dN_31, d0_32,....d0_63, ...
+     Output channels are taken in blocks of XCHAL_IVPN_SIMD_WIDTH; inside a
+     block the output channels of one input channel are stored contiguously,
+     followed by those of the next input channel.
+   */
+  for (outBase = 0; outBase < outChannels; outBase += XCHAL_IVPN_SIMD_WIDTH)
+  {
+    /* The last block may hold fewer than XCHAL_IVPN_SIMD_WIDTH output channels */
+    const int32_t remaining = outChannels - outBase;
+    const int32_t lanes     = (remaining < XCHAL_IVPN_SIMD_WIDTH) ? remaining : XCHAL_IVPN_SIMD_WIDTH;
+
+    for (inCh = 0; inCh < inChannels; inCh++)
+    {
+      const int16_t *pSrc = pSrcBase + (outBase * outChStride) + (inCh * inChStride);
+      int16_t *pDst       = pDstBase + (outBase * inChannels) + (inCh * XCHAL_IVPN_SIMD_WIDTH);
+
+      for (lane = 0; lane < lanes; lane++)
+      {
+        pDst[lane] = pSrc[lane * outChStride];
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
+/*********************** xaiReOrder4DToIN64DWH_I8 ***************************/
+/* Description : C-code implementation to reorder a tile from WHDN,        */
+/*               DWHN or NDWH into IN64DWH format                          */
+/* Inputs      : Coeff Tile  in WHDN or DWHN or NDWH format                */
+/* Outputs     : Coeff Array in IN64DWH format                             */
+/* Assumptions : The width and height of the coefficient tile are 1        */
+/*               Input and Output tiles can be S8 / U8                     */
+/***************************************************************************/
+XAI_ERR_TYPE xaiReOrder4DToIN64DWH_I8(xai_pTile4D coeffTileIn, xai_pTile4D coeffTileOut)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE4D_I8(coeffTileIn);
+    XAI_CHECK_TILE4D_I8(coeffTileOut);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_WHDN) ||                                                       \
+                    (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_DWHN) || (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_NDWH), \
+                    XAI_ERR_BADARG, "The Data Order of the input  is not supported by this function");
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTileOut, XAI_IN64DWH);
+    XAI_CHECK_DIM_IN64DWH(coeffTileIn, coeffTileOut);
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DATA_PTR(coeffTileIn) != XAI_ARRAY_GET_DATA_PTR(coeffTileOut)), XAI_ERR_INPLACE, "The input and output tile pointers overlap");
+  }
+  int32_t numInCh, numOutCh, minCh, coeffInPitch1, coeffInPitch3;
+
+  if (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_WHDN)
+  {
+    numInCh       = XAI_TILE4D_GET_DIM3(coeffTileIn);
+    numOutCh      = XAI_TILE4D_GET_DIM4(coeffTileIn);
+    coeffInPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTileIn);
+    coeffInPitch1 = 1;
+  }
+  else if (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_DWHN)
+  {
+    numInCh       = XAI_TILE4D_GET_DIM1(coeffTileIn);
+    numOutCh      = XAI_TILE4D_GET_DIM4(coeffTileIn);
+    coeffInPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTileIn);
+    coeffInPitch1 = 1;
+  }
+  else /* If coeff tile NDWH */
+  {
+    numInCh       = XAI_TILE4D_GET_DIM2(coeffTileIn);
+    numOutCh      = XAI_TILE4D_GET_DIM1(coeffTileIn);
+    coeffInPitch3 = 1;
+    coeffInPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTileIn);
+  }
+
+  int8_t *pCoeff    = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTileIn);
+  int8_t *pCoeffOut = (int8_t *) XAI_ARRAY_GET_DATA_PTR(coeffTileOut);
+  int32_t i, j, k;
+
+  /* Reorder Coeff tile */
+  /*
+     The coefficient tile is reordered in the format IN64DWH:
+     d0_0,....d0_63, d1_0,...d1_63, ....dN_0,...dN_63, d0_64,....d0_127, d1_64,...d1_127, ....dN_64,...dN_127,
+     d0_128,....d0_191, d1_128,...d1_191, ....dN_128,...dN_191,...
+
+     Here, d0, d1,....dN are input channels.
+     where 'N' is the total input channels.
+     d0_0 => 0_0 => inputChNumber_outputChNumber
+   */
+
+  for (i = 0; i < numOutCh; i += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  {
+    for (j = 0; j < numInCh; j++)
+    {
+      minCh = (numOutCh - i) >= (2 * XCHAL_IVPN_SIMD_WIDTH) ? (2 * XCHAL_IVPN_SIMD_WIDTH) : (numOutCh - i);
+      for (k = 0; k < minCh; k++)
+      {
+        int8_t val = *(pCoeff + (k + i) * coeffInPitch3 + j * coeffInPitch1);
+        *(pCoeffOut + k + (j * 2 * XCHAL_IVPN_SIMD_WIDTH) + i * numInCh) = val;
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+#if 0 //(XCHAL_HAVE_VISION_HP_VFPU == 1) // Disabled the F16 helper APIs which are not used anywhere
+
+/****************************************************************************/
+/* Description : Extends the bias array for MOD deconvolution using         */
+/*               superkernels: outWidth / inWidth back-to-back copies.      */
+/* Inputs      : Input Bias array,                                          */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Output Bias array                                          */
+/****************************************************************************/
+XAI_ERR_TYPE xaiBiasExtend_F16_MOD(const xai_pArray inBiasArray,
+                                   xai_pArray outBiasArray)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_ARRAY_F16(inBiasArray);
+    XAI_CHECK_ARRAY_F16(outBiasArray);
+  }
+
+  int32_t inWidth  = XAI_ARRAY_GET_WIDTH(inBiasArray);
+  int32_t outWidth = XAI_ARRAY_GET_WIDTH(outBiasArray);
+  int32_t strideX  = outWidth / inWidth; /* copy count (integer division); NOTE(review): inWidth == 0 is not guarded here - confirm the array check rejects it */
+
+  xb_f16* pInBias  = (xb_f16 *) XAI_ARRAY_GET_DATA_PTR(inBiasArray);
+  xb_f16* pOutBias = (xb_f16 *) XAI_ARRAY_GET_DATA_PTR(outBiasArray);
+
+  int32_t numX, inW;
+  for (numX = 0; numX < strideX; numX++) /* one pass per copy */
+  {
+    for (inW = 0; inW < inWidth; inW++)
+    {
+      pOutBias[inW + inWidth * numX] = pInBias[inW]; /* copy numX occupies [inWidth * numX, inWidth * (numX + 1)) */
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/*****************************************************************************/
+/* Description : Extends the outputscale array for MOD deconvolution using   */
+/*               superkernels: outWidth / inWidth back-to-back copies.       */
+/* Inputs      : outputScale array,                                          */
+/* Outputs     : XI Error Code                                               */
+/* InOuts      : extended outputScale array                                  */
+/*****************************************************************************/
+XAI_ERR_TYPE xaiOutScaleExtend_F16_MOD(const xai_pArray outScaleArray,
+                                       xai_pArray extendedOutScaleArray)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_ARRAY_F16(outScaleArray);
+    XAI_CHECK_ARRAY_F16(extendedOutScaleArray);
+  }
+
+  int32_t inWidth  = XAI_ARRAY_GET_WIDTH(outScaleArray);
+  int32_t outWidth = XAI_ARRAY_GET_WIDTH(extendedOutScaleArray);
+  int32_t strideX  = outWidth / inWidth; /* copy count (integer division); NOTE(review): inWidth == 0 is not guarded here - confirm the array check rejects it */
+
+  xb_f16* pInScale  = (xb_f16 *) XAI_ARRAY_GET_DATA_PTR(outScaleArray);
+  xb_f16* pOutScale = (xb_f16 *) XAI_ARRAY_GET_DATA_PTR(extendedOutScaleArray);
+
+  int32_t numX, inW;
+  for (numX = 0; numX < strideX; numX++) /* one pass per copy */
+  {
+    for (inW = 0; inW < inWidth; inW++)
+    {
+      pOutScale[inW + inWidth * numX] = pInScale[inW]; /* copy numX occupies [inWidth * numX, inWidth * (numX + 1)) */
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Implementation for coefficient reordering                  */
+/*               The function does the following:                           */
+/*               - Convert from NDWH->DNWH                                  */
+/*               - Flips the coefficients across width and height which is  */
+/*                 controlled by transposeCoeffsFlag.                       */
+/*               - Breaks the kernel into sub-kernels.                      */
+/*               - Stacks sub-kernels to form super kernels.                */
+/* Inputs      : Input Coeff Tile, CNN convolution params structure,        */
+/*               transposeCoeffsFlag                                        */
+/* Outputs     : XI Error Code                                              */
+/* InOuts      : Array of Coeff Sub & Super Tiles                           */
+/* Assumptions : CoeffData is F16                                           */
+/*               Coeff is in NDWH format                                    */
+/****************************************************************************/
+XAI_ERR_TYPE xaiDeConvReOrder4D_F16_NDWH(const xai_pTile4D inTile,
+                                         xai_pTile4D subCoeffs[],
+                                         xai_pTile4D superCoeffs[],
+                                         const xai_cnn_conv_params *param,
+                                         const uint8_t transposeCoeffsFlag)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE4D_F16(inTile);
+    XAI_CHECK_TILE4D_DATA_ORDER(inTile, XAI_NDWH);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_POINTER(subCoeffs);
+    XAI_CHECK_POINTER(superCoeffs);
+    XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) >= 1) &&                                                  \
+                     (XAI_CNN_CONV_GET_STRIDEX(param) <= XAI_TILE4D_GET_DIM3(inTile))) &&                       \
+                    ((XAI_CNN_CONV_GET_STRIDEY(param) >= 1) &&                                                  \
+                     (XAI_CNN_CONV_GET_STRIDEY(param) <= XAI_TILE4D_GET_DIM4(inTile))), XAI_ERR_BADARG,         \
+                    "StrideX = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Width) \
+       \nStrideY = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Height)", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_TILE4D_GET_DIM3(inTile),                               \
+                    XAI_CNN_CONV_GET_STRIDEY(param), XAI_TILE4D_GET_DIM4(inTile));
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1), \
+                    XAI_ERR_BADARG, "\nDilation is %hhu\nDilation parameter should be equal to 1", XAI_CNN_CONV_GET_DILATION(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param),                          \
+                    XAI_ERR_BADARG, "\nDilation along width is %hhu and dilation along height is %hhu are not same", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+  }
+
+  int32_t kIdx, kIdy;
+  int32_t kernelIdx;
+
+  XAI_ERROR_CHECKS_CONTINUE() /* validates strideX * strideY sub-kernel tiles and strideY super-kernel tiles */
+  {
+    for (kIdy = 0; kIdy < XAI_CNN_CONV_GET_STRIDEY(param); kIdy++)
+    {
+      for (kIdx = 0; kIdx < XAI_CNN_CONV_GET_STRIDEX(param); kIdx++)
+      {
+        kernelIdx = kIdy * XAI_CNN_CONV_GET_STRIDEX(param) + kIdx;
+        XAI_CHECK_TILE4D_F16(subCoeffs[kernelIdx]);
+        XAI_CHECK_TILE4D_DATA_ORDER(subCoeffs[kernelIdx], XAI_NDWH);
+      }
+      XAI_CHECK_TILE4D_F16(superCoeffs[kIdy]);
+      XAI_CHECK_TILE4D_DATA_ORDER(superCoeffs[kIdy], XAI_NDWH);
+    }
+  }
+
+  xb_f16 *pInCoeff = (xb_f16 *) XAI_TILE4D_GET_DATA_PTR(inTile);
+
+  const int32_t numOutCh = XAI_TILE4D_GET_DIM1(inTile); /* N */
+  const int32_t numInCh  = XAI_TILE4D_GET_DIM2(inTile); /* D */
+  const int32_t kWidth   = XAI_TILE4D_GET_DIM3(inTile); /* W */
+  const int32_t kHeight  = XAI_TILE4D_GET_DIM4(inTile); /* H */
+
+  const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param);
+
+  int32_t inCoeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(inTile);
+  int32_t inCoeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(inTile);
+  int32_t inCoeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(inTile);
+
+  int32_t kx, ky, inCh, outCh, inIdx, outIdx = 0;
+  xb_f16 *pSuperCoeff;
+  xb_f16 *pSubCoeff;
+  int32_t subKPitch1, subKPitch2, subKPitch3;
+  int32_t superKPitch1, superKPitch2;
+  int32_t kW, kH, subkW;
+  int32_t numInChSubCoeff;
+  int32_t subKIdx;
+
+  int32_t kxStart, kyStart;
+
+  if (transposeCoeffsFlag)
+  {
+    /* Conversion from NDWH -> DNWH,                       */
+    /* transposing of kernels and formation of sub-kernels */
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        xb_f16 *pSubCoeff = (xb_f16 *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]); /* shadows the function-scope pSubCoeff */
+
+        outIdx  = 0;
+        kyStart = kHeight - 1 - ((kHeight + strideY - kIdy - 1) % strideY); /* highest row visited; the loop walks down in steps of strideY */
+
+        for (ky = kyStart; ky >= 0; ky -= strideY)          /* H */
+        {
+          kxStart = kWidth - 1 - ((kWidth + strideX - kIdx - 1) % strideX); /* highest column visited; the loop walks down in steps of strideX */
+
+          for (kx = kxStart; kx >= 0; kx -= strideX)        /* W */
+          {
+            for (outCh = 0; outCh < numOutCh; outCh++)      /* N */
+            {
+              for (inCh = 0; inCh < numInCh; inCh++)        /* D */
+              {
+                inIdx = ky * inCoeffPitch3 + kx * inCoeffPitch2 + \
+                        inCh * inCoeffPitch1 + outCh;
+                pSubCoeff[outIdx++] = pInCoeff[inIdx];
+              }
+              /* Round outIdx up to the next multiple of XCHAL_IVPN_SIMD_WIDTH (stride alignment) */
+              outIdx += (outIdx % (XCHAL_IVPN_SIMD_WIDTH)) ? ((XCHAL_IVPN_SIMD_WIDTH) -(outIdx % (XCHAL_IVPN_SIMD_WIDTH))) : 0;
+            }
+          }
+        }
+      }
+    }
+  }
+  else
+  {
+    /* Conversion from NDWH -> DNWH and formation of sub-kernels */
+    for (kIdy = 0; kIdy < strideY; kIdy++)
+    {
+      for (kIdx = 0; kIdx < strideX; kIdx++)
+      {
+        kernelIdx = kIdy * strideX + kIdx;
+        xb_f16 *pSubCoeff = (xb_f16 *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]); /* shadows the function-scope pSubCoeff */
+
+        outIdx  = 0;
+        kyStart = ((kHeight + strideY - kIdy - 1) % strideY); /* lowest row visited; the loop walks up in steps of strideY */
+
+        for (ky = kyStart; ky < kHeight; ky += strideY)          /* H */
+        {
+          kxStart = ((kWidth + strideX - kIdx - 1) % strideX); /* lowest column visited; the loop walks up in steps of strideX */
+
+          for (kx = kxStart; kx < kWidth; kx += strideX)         /* W */
+          {
+            for (outCh = 0; outCh < numOutCh; outCh++)           /* N */
+            {
+              for (inCh = 0; inCh < numInCh; inCh++)             /* D */
+              {
+                inIdx = ky * inCoeffPitch3 + kx * inCoeffPitch2 + \
+                        inCh * inCoeffPitch1 + outCh;
+                pSubCoeff[outIdx++] = pInCoeff[inIdx];
+              }
+              /* Round outIdx up to the next multiple of XCHAL_IVPN_SIMD_WIDTH (stride alignment) */
+              outIdx += (outIdx % (XCHAL_IVPN_SIMD_WIDTH)) ? ((XCHAL_IVPN_SIMD_WIDTH) -(outIdx % (XCHAL_IVPN_SIMD_WIDTH))) : 0;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /* Form super-kernels by stacking sub-kernels */
+  for (kernelIdx = 0; kernelIdx < strideY; kernelIdx++)
+  {
+    pSuperCoeff = (xb_f16 *) XAI_TILE4D_GET_DATA_PTR(superCoeffs[kernelIdx]);
+
+    kW = XAI_TILE4D_GET_DIM3(superCoeffs[kernelIdx]);
+    kH = XAI_TILE4D_GET_DIM4(superCoeffs[kernelIdx]);
+
+    numInChSubCoeff = XAI_TILE4D_GET_DIM1(subCoeffs[kernelIdx * strideX]); /* DIM1 of the first sub-kernel of this super-kernel */
+    superKPitch1    = XAI_TILE4D_GET_DIM1_PITCH(superCoeffs[kernelIdx]);
+    superKPitch2    = XAI_TILE4D_GET_DIM2_PITCH(superCoeffs[kernelIdx]);
+
+    for (subKIdx = 0; subKIdx < strideX; subKIdx++)
+    {
+      pSubCoeff = (xb_f16 *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx * strideX + subKIdx]);
+
+      subkW = XAI_TILE4D_GET_DIM3(subCoeffs[kernelIdx * strideX + subKIdx]);
+
+      subKPitch1 = XAI_TILE4D_GET_DIM1_PITCH(subCoeffs[kernelIdx * strideX + subKIdx]);
+      subKPitch2 = XAI_TILE4D_GET_DIM2_PITCH(subCoeffs[kernelIdx * strideX + subKIdx]);
+      subKPitch3 = XAI_TILE4D_GET_DIM3_PITCH(subCoeffs[kernelIdx * strideX + subKIdx]);
+
+      outIdx = numInChSubCoeff * subKIdx; /* sub-kernel subKIdx starts numInChSubCoeff * subKIdx elements in */
+
+      for (ky = 0, kIdy = 0; ky < kH; ky++, kIdy++)          /* H */
+      {
+        for (kx = 0, kIdx = 0; kx < kW; kx++, kIdx++)        /* W */
+        {
+          /*In case of super kernels we have the first sub kernel width/height as the width/height of the superkernel     */
+          /*In case the widths of the subkernel are not equal then we skip by difference and start filling                */
+          /*Once the convolution is done the output junk data appears at the end of the outtile.                          */
+          /*In case of unequal heights this is handled using pointers in test app.                                        */
+          if ((subkW < kW) && (kx == 0))
+          {
+            outIdx += superKPitch2;
+            kIdx--; /* cancels the loop's kIdx++ so the sub-kernel column index does not advance on the skipped column */
+            continue;
+          }
+          for (outCh = 0; outCh < numOutCh; outCh++)         /* N */
+          {
+            for (inCh = 0; inCh < numInChSubCoeff; inCh++)   /* D */
+            {
+              inIdx = kIdy * subKPitch3 + kIdx * subKPitch2 + \
+                      outCh * subKPitch1 + inCh;
+              pSuperCoeff[outIdx++] = pSubCoeff[inIdx];
+            }
+            outIdx += (superKPitch1 - numInChSubCoeff); /* advance to the same offset in the next superKPitch1-sized row */
+          }
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+#endif //if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+#endif //if ((XCHAL_VISION_TYPE >= 6))
+
diff --git a/backends/cadence/vision/third-party/libxai/include/xai_cnn.h b/backends/cadence/vision/third-party/libxai/include/xai_cnn.h
new file mode 100644
index 00000000000..2ba56fe0e98
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/include/xai_cnn.h
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#ifndef __XAI_CNN_H__
+#define __XAI_CNN_H__
+
+#include "xai_cnn_api.h"
+#include "xai_cnn_common.h"
+#include "xai_tile_manager.h"
+#include "xai_core.h"
+#include "limits.h"
+
+/****************************************************************************/
+/* MACROS :                                                                 */
+/* Pack the accumulator to even/odd 16-bit vectors, scale (even/odd scale   */
+/* vectors), shift, clamp to [min, max]; flag picks dvecOut1 vs m_dvec.     */
+/****************************************************************************/
+#define PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1, dvecOut2, daccSum, packSA, outScaleDataEven, outScaleDataOdd, outSh, min, max, flag)  { \
+    xb_vecNx16 m_outEven = IVP_PACKVR2NX24_0(daccSum, packSA);                                                                             \
+    xb_vecNx16 m_outOdd  = IVP_PACKVR2NX24_1(daccSum, packSA);                                                                             \
+    xb_vecNx48 m_wvec    = IVP_MULUSNX16(outScaleDataEven, m_outEven);                                                                     \
+    m_outEven = IVP_PACKVRNX48(m_wvec, outSh);                                                                                             \
+    m_wvec    = IVP_MULUSNX16(outScaleDataOdd, m_outOdd);                                                                                  \
+    m_outOdd  = IVP_PACKVRNX48(m_wvec, outSh);                                                                                             \
+    m_outEven = IVP_MAXNX16(IVP_MINNX16(m_outEven, (xb_vecNx16) max), (xb_vecNx16) min);                                                   \
+    m_outOdd  = IVP_MAXNX16(IVP_MINNX16(m_outOdd, (xb_vecNx16) max), (xb_vecNx16) min);                                                    \
+    xb_vec2Nx8 m_dvec = IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(m_outOdd),                                                                       \
+                                     IVP_MOV2NX8_FROMNX16(m_outEven),                                                                      \
+                                     IVP_SELI_8B_INTERLEAVE_1_EVEN);                                                                       \
+    IVP_DSEL2NX8I(dvecOut2, dvecOut1, IVP_MOV2NX8_FROMNX16(m_outOdd),                                                                      \
+                  IVP_MOV2NX8_FROMNX16(m_outEven),                                                                                         \
+                  IVP_DSELI_INTERLEAVE_1);                                                                                                 \
+    dvecOut1 = IVP_MOV2NX8T(dvecOut1, m_dvec, IVP_EQ2NX8((xb_vec2Nx8) flag, 1));                                                           \
+}
+
+#define PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1, dvecOut2, daccSum, packSA, outSc, outSh, min, max, flag) \
+  PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1, dvecOut2, daccSum, packSA, outSc, outSc, outSh, min, max, flag)
+
+/****************************************************************************/
+/* MACROS :                                                                 */
+/* Pack accSum to 16 bits, multiply by outSc, pack/shift by outSh into      */
+/* 32-bit even/odd halves, clamp to [min, max], interleave into vecOut.     */
+/****************************************************************************/
+#define PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut, accSum, packSA, outSc, outSh, min, max)  {       \
+    vecOut = IVP_PACKVRNX48(accSum, packSA);                                                       \
+    xb_vecNx48 m_wvec       = IVP_MULUSNX16(outSc, vecOut);                                        \
+    xb_vecN_2x32v m_outEven = IVP_PACKVRNX48_0(m_wvec, outSh);                                     \
+    xb_vecN_2x32v m_outOdd  = IVP_PACKVRNX48_1(m_wvec, outSh);                                     \
+    m_outEven = IVP_MAXN_2X32(IVP_MINN_2X32(m_outEven, (xb_vecN_2x32v) max), (xb_vecN_2x32v) min); \
+    m_outOdd  = IVP_MAXN_2X32(IVP_MINN_2X32(m_outOdd, (xb_vecN_2x32v) max), (xb_vecN_2x32v) min);  \
+    vecOut    = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32(m_outOdd),                                     \
+                             IVP_MOVNX16_FROMN_2X32(m_outEven),                                    \
+                             IVP_SELI_INTERLEAVE_1_EVEN);                                          \
+}
+
+#define PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut, accSum, packSA, vecScaleData, outSh, min, max) \
+  PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut, accSum, packSA, vecScaleData, outSh, min, max)
+
+#define PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut1, dvecOut2, hvecSumLL, hvecSumLH, hvecSumHL, hvecSumHH, packSA, outSc, outSh, min, max, flag)                            { \
+    xb_vecNx48 vecSumL = IVP_CVT48SNX32(hvecSumLH, hvecSumLL); /* 32-bit sums -> 48-bit; then pack, scale, shift, clamp to [min, max] */                                      \
+    xb_vecNx48 vecSumH = IVP_CVT48SNX32(hvecSumHH, hvecSumHL);                                                                                                                \
+    xb_vecNx16 m_outL  = IVP_PACKVRNX48(vecSumL, packSA);                                                                                                                     \
+    xb_vecNx16 m_outH  = IVP_PACKVRNX48(vecSumH, packSA);                                                                                                                     \
+    xb_vecNx48 m_wvec  = IVP_MULUSNX16((xb_vecNx16U) outSc, m_outL);                                                                                                          \
+    m_outL = IVP_PACKVRNX48(m_wvec, outSh);                                                                                                                                   \
+    m_outL = IVP_MAXNX16(IVP_MINNX16(m_outL, (xb_vecNx16) max), (xb_vecNx16) min); /* clamp to the min/max macro arguments */                                                 \
+    m_wvec = IVP_MULUSNX16((xb_vecNx16U) outSc, m_outH);                                                                                                                      \
+    m_outH = IVP_PACKVRNX48(m_wvec, outSh);                                                                                                                                   \
+    m_outH = IVP_MAXNX16(IVP_MINNX16(m_outH, (xb_vecNx16) max), (xb_vecNx16) min); /* clamp to the min/max macro arguments */                                                 \
+    xb_vec2Nx8 m_dvec = IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(m_outH), IVP_MOV2NX8_FROMNX16(m_outL), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);                                           \
+    dvecOut1 = IVP_MOV2NX8_FROMNX16(m_outL);                                                                                                                                  \
+    dvecOut1 = IVP_MOV2NX8T(dvecOut1, m_dvec, IVP_EQ2NX8(flag, 1)); /* per-lane pick of dvecOut1 or m_dvec by (flag == 1) */                                                  \
+    dvecOut2 = IVP_MOV2NX8_FROMNX16(m_outH);                                                                                                                                  \
+}
+
+#define PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut1, dvecOut2, hvecSumLL, hvecSumLH, hvecSumHL, hvecSumHH, packSA, outScaleDataL, outScaleDataH, outSh, min, max, flag)  { \
+    xb_vecNx48 vecSumL = IVP_CVT48SNX32(hvecSumLH, hvecSumLL); /* 32-bit sums -> 48-bit; then pack, scale, shift, clamp to [min, max] */                                      \
+    xb_vecNx48 vecSumH = IVP_CVT48SNX32(hvecSumHH, hvecSumHL);                                                                                                                \
+    xb_vecNx16 m_outL  = IVP_PACKVRNX48(vecSumL, packSA);                                                                                                                     \
+    xb_vecNx16 m_outH  = IVP_PACKVRNX48(vecSumH, packSA);                                                                                                                     \
+    xb_vecNx48 m_wvec  = IVP_MULUSNX16((xb_vecNx16U) outScaleDataL, m_outL); /* low half uses outScaleDataL */                                                                \
+    m_outL = IVP_PACKVRNX48(m_wvec, outSh);                                                                                                                                   \
+    m_outL = IVP_MAXNX16(IVP_MINNX16(m_outL, (xb_vecNx16) max), (xb_vecNx16) min); /* clamp to the min/max macro arguments */                                                 \
+    m_wvec = IVP_MULUSNX16((xb_vecNx16U) outScaleDataH, m_outH); /* high half uses outScaleDataH */                                                                           \
+    m_outH = IVP_PACKVRNX48(m_wvec, outSh);                                                                                                                                   \
+    m_outH = IVP_MAXNX16(IVP_MINNX16(m_outH, (xb_vecNx16) max), (xb_vecNx16) min); /* clamp to the min/max macro arguments */                                                 \
+    xb_vec2Nx8 m_dvec = IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(m_outH), IVP_MOV2NX8_FROMNX16(m_outL), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);                                           \
+    dvecOut1 = IVP_MOV2NX8_FROMNX16(m_outL);                                                                                                                                  \
+    dvecOut1 = IVP_MOV2NX8T(dvecOut1, m_dvec, IVP_EQ2NX8(flag, 1)); /* per-lane pick of dvecOut1 or m_dvec by (flag == 1) */                                                  \
+    dvecOut2 = IVP_MOV2NX8_FROMNX16(m_outH);                                                                                                                                  \
+}
+
+/****************************************************************************/
+/* MACROS :                                                                 */
+/* Rearrange the 32-bit sums with the sel pattern, then pack, scale, shift  */
+/* and clamp the output between the min and max macro arguments.            */
+/****************************************************************************/
+#define PACK_SCALE_SHIFT_CLAMP_LIMITS_IXS16(dvecOut1, dvecOut2, hvecSumLL, hvecSumLH, hvecSumHL, hvecSumHH, packSA, outScaleDataL, outScaleDataH, outSh, min, max, flag, sel)  { \
+    xb_vecNx16 hvecSum1, hvecSum2, hvecSum3, hvecSum4;                                                                                                                           \
+    IVP_DSELNX16(hvecSum3, hvecSum1, IVP_MOVNX16_FROMN_2X32(hvecSumLH), IVP_MOVNX16_FROMN_2X32(hvecSumLL), sel);                                                                 \
+    IVP_DSELNX16(hvecSum4, hvecSum2, IVP_MOVNX16_FROMN_2X32(hvecSumHH), IVP_MOVNX16_FROMN_2X32(hvecSumHL), sel);                                                                 \
+    xb_vecNx48 vecSumL = IVP_CVT48SNX32(IVP_MOVN_2X32_FROMNX16(hvecSum2), IVP_MOVN_2X32_FROMNX16(hvecSum1));                                                                     \
+    xb_vecNx48 vecSumH = IVP_CVT48SNX32(IVP_MOVN_2X32_FROMNX16(hvecSum4), IVP_MOVN_2X32_FROMNX16(hvecSum3));                                                                     \
+    xb_vecNx16 m_outL  = IVP_PACKVRNX48(vecSumL, packSA);                                                                                                                        \
+    xb_vecNx16 m_outH  = IVP_PACKVRNX48(vecSumH, packSA);                                                                                                                        \
+    xb_vecNx48 m_wvec  = IVP_MULUSNX16((xb_vecNx16U) outScaleDataL, m_outL);                                                                                                     \
+    m_outL = IVP_PACKVRNX48(m_wvec, outSh);                                                                                                                                      \
+    m_outL = IVP_MAXNX16(IVP_MINNX16(m_outL, (xb_vecNx16) max), (xb_vecNx16) min); /* clamp to the min/max macro arguments */                                                    \
+    m_wvec = IVP_MULUSNX16((xb_vecNx16U) outScaleDataH, m_outH);                                                                                                                 \
+    m_outH = IVP_PACKVRNX48(m_wvec, outSh);                                                                                                                                      \
+    m_outH = IVP_MAXNX16(IVP_MINNX16(m_outH, (xb_vecNx16) max), (xb_vecNx16) min); /* clamp to the min/max macro arguments */                                                    \
+    xb_vec2Nx8 m_dvec = IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(m_outH), IVP_MOV2NX8_FROMNX16(m_outL), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);                                              \
+    dvecOut1 = IVP_MOV2NX8_FROMNX16(m_outL);                                                                                                                                     \
+    dvecOut1 = IVP_MOV2NX8T(dvecOut1, m_dvec, IVP_EQ2NX8(flag, 1)); /* per-lane pick of dvecOut1 or m_dvec by (flag == 1) */                                                     \
+    dvecOut2 = IVP_MOV2NX8_FROMNX16(m_outH);                                                                                                                                     \
+}
+
+#define PACK_SCALE_SHIFT_CLAMP_LIMITS_S16S8(dvecOut1, dvecOut2, hvecSumLL, hvecSumLH, hvecSumHL, hvecSumHH, packSA, outScaleDataL, outScaleDataH, outSh, min, max, flag)       { \
+    xb_vecNx48 vecSumL = IVP_CVT48SNX32(hvecSumLH, hvecSumLL); /* 32-bit sums -> 48-bit; then pack, scale, shift, clamp to [min, max] */                                         \
+    xb_vecNx48 vecSumH = IVP_CVT48SNX32(hvecSumHH, hvecSumHL);                                                                                                                   \
+    xb_vecNx16 m_outL  = IVP_PACKVRNX48(vecSumL, packSA);                                                                                                                        \
+    xb_vecNx16 m_outH  = IVP_PACKVRNX48(vecSumH, packSA);                                                                                                                        \
+    xb_vecNx48 m_wvec  = IVP_MULUSNX16((xb_vecNx16U) outScaleDataL, m_outL); /* low half uses outScaleDataL */                                                                   \
+    m_outL = IVP_PACKVRNX48(m_wvec, outSh);                                                                                                                                      \
+    m_outL = IVP_MAXNX16(IVP_MINNX16(m_outL, (xb_vecNx16) max), (xb_vecNx16) min); /* clamp to the min/max macro arguments */                                                    \
+    m_wvec = IVP_MULUSNX16((xb_vecNx16U) outScaleDataH, m_outH); /* high half uses outScaleDataH */                                                                              \
+    m_outH = IVP_PACKVRNX48(m_wvec, outSh);                                                                                                                                      \
+    m_outH = IVP_MAXNX16(IVP_MINNX16(m_outH, (xb_vecNx16) max), (xb_vecNx16) min); /* clamp to the min/max macro arguments */                                                    \
+    xb_vec2Nx8 m_dvec = IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(m_outH), IVP_MOV2NX8_FROMNX16(m_outL), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);                                              \
+    dvecOut1 = IVP_MOV2NX8_FROMNX16(m_outL);                                                                                                                                     \
+    dvecOut1 = IVP_MOV2NX8T(dvecOut1, m_dvec, IVP_EQ2NX8(flag, 1)); /* per-lane pick of dvecOut1 or m_dvec by (flag == 1) */                                                     \
+    dvecOut2 = IVP_MOV2NX8_FROMNX16(m_outH);                                                                                                                                     \
+}
+
+/****************************************************************************/
+/* PACK_SCALE_SHIFT_S24_S16: rescales the 24-bit accumulator accdotProd in  */
+/* two multiply-then-pack stages (scale1/accShift1, then vecScale1L/H with  */
+/* shift1); the 16-bit results are left in vecClampL and vecClampH.         */
+/****************************************************************************/
+
+#define PACK_SCALE_SHIFT_S24_S16(accdotProd, scale1, accShift1,                           \
+                                 vecClampL, vecClampH, vecScale1L, vecScale1H, shift1)  { \
+    xb_vecN_2x32v vecaccHH = IVP_CVT32S2NX24HH(accdotProd);  /* split acc: 4 x 32-bit */  \
+    xb_vecN_2x32v vecaccHL = IVP_CVT32S2NX24HL(accdotProd);                               \
+    xb_vecN_2x32v vecaccLH = IVP_CVT32S2NX24LH(accdotProd);                               \
+    xb_vecN_2x32v vecaccLL = IVP_CVT32S2NX24LL(accdotProd);                               \
+    xb_vecN_2x64w haccA, haccB, haccC, haccD;  /* 64-bit wide products */                 \
+    haccA = IVP_MULN_2X16X32_0(scale1, vecaccLL);                                         \
+    haccB = IVP_MULN_2X16X32_0(scale1, vecaccLH);                                         \
+    haccC = IVP_MULN_2X16X32_0(scale1, vecaccHL);                                         \
+    haccD = IVP_MULN_2X16X32_0(scale1, vecaccHH);                                         \
+    xb_vecN_2x32v hvec0LL = IVP_PACKVRN_2X64W(haccA, accShift1);  /* pack to 32-bit */    \
+    xb_vecN_2x32v hvec0LH = IVP_PACKVRN_2X64W(haccB, accShift1);                          \
+    xb_vecN_2x32v hvec0HL = IVP_PACKVRN_2X64W(haccC, accShift1);                          \
+    xb_vecN_2x32v hvec0HH = IVP_PACKVRN_2X64W(haccD, accShift1);                          \
+    xb_vecNx48 accA       = IVP_CVT48SNX32(hvec0LH, hvec0LL);                             \
+    xb_vecNx48 accB       = IVP_CVT48SNX32(hvec0HH, hvec0HL);                             \
+    vecClampL  = IVP_PACKVRNX48(accA, 0);                                                 \
+    vecClampH  = IVP_PACKVRNX48(accB, 0);                                                 \
+    accdotProd = IVP_CVT24S2NX16(vecClampH, vecClampL);  /* accdotProd is overwritten */  \
+    xb_vecNx16U vecScaleLL = IVP_SELNX16UI(0, vecScale1L, IVP_SELI_INTERLEAVE_1_LO);      \
+    xb_vecNx16U vecScaleLH = IVP_SELNX16UI(0, vecScale1L, IVP_SELI_INTERLEAVE_1_HI);      \
+    xb_vecNx16U vecScaleHL = IVP_SELNX16UI(0, vecScale1H, IVP_SELI_INTERLEAVE_1_LO);      \
+    xb_vecNx16U vecScaleHH = IVP_SELNX16UI(0, vecScale1H, IVP_SELI_INTERLEAVE_1_HI);      \
+    vecaccHH  = IVP_CVT32S2NX24HH(accdotProd);  /* stage 2: vecScale1L/H */               \
+    vecaccHL  = IVP_CVT32S2NX24HL(accdotProd);                                            \
+    vecaccLH  = IVP_CVT32S2NX24LH(accdotProd);                                            \
+    vecaccLL  = IVP_CVT32S2NX24LL(accdotProd);                                            \
+    haccA     = IVP_MULUSN_2X16X32_0(vecScaleLL, vecaccLL);                               \
+    haccB     = IVP_MULUSN_2X16X32_0(vecScaleLH, vecaccLH);                               \
+    haccC     = IVP_MULUSN_2X16X32_0(vecScaleHL, vecaccHL);                               \
+    haccD     = IVP_MULUSN_2X16X32_0(vecScaleHH, vecaccHH);                               \
+    hvec0LL   = IVP_PACKVRN_2X64W(haccA, shift1);                                         \
+    hvec0LH   = IVP_PACKVRN_2X64W(haccB, shift1);                                         \
+    hvec0HL   = IVP_PACKVRN_2X64W(haccC, shift1);                                         \
+    hvec0HH   = IVP_PACKVRN_2X64W(haccD, shift1);                                         \
+    accA      = IVP_CVT48SNX32(hvec0LH, hvec0LL);                                         \
+    accB      = IVP_CVT48SNX32(hvec0HH, hvec0HL);                                         \
+    vecClampL = IVP_PACKVRNX48(accA, 0);                                                  \
+    vecClampH = IVP_PACKVRNX48(accB, 0);                                                  \
+}
+/* PACK_SCALE_SHIFT_S48_S8: packs accProd with accShift2, multiplies by scale2L, packs with shift2 into vecRescale. */
+#define PACK_SCALE_SHIFT_S48_S8(accProd, accShift2, scale2L, shift2, vecRescale)        {                        \
+    xb_vecN_2x64w wvecAccL = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(accProd), IVP_CVT64SNX48LL(accProd));             \
+    xb_vecN_2x64w wvecAccH = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(accProd), IVP_CVT64SNX48HL(accProd));             \
+    accProd    = IVP_CVT48SNX32(IVP_PACKVRN_2X64W(wvecAccH, accShift2), IVP_PACKVRN_2X64W(wvecAccL, accShift2)); \
+    vecRescale = IVP_PACKVRNX48(accProd, 0);                                                                     \
+    accProd    = IVP_MULUSNX16(scale2L, vecRescale);  /* caller's accProd is clobbered */                        \
+    vecRescale = IVP_PACKVRNX48(accProd, shift2);                                                                \
+}
+
+/*
+ * PACK_SCALE_SHIFT_S32_S8: scales two input vectors, clamps every lane to
+ * [SCHAR_MIN, SCHAR_MAX] (needs <limits.h>) and merges them into dvecOut.
+ *   inReg   - first input; multiplied by scale2 via IVP_MULUSN_2X16X32_0
+ *   inReg1  - second input; multiplied by scale2 via IVP_MULUSN_2X16X32_1
+ *   scale2  - scale vector, cast to xb_vecNx16U before each multiply
+ *   shift2  - shift passed to IVP_PACKVRN_2X64W for both products
+ *   seq1    - select pattern given to IVP_SEL2NX8 for the final merge
+ *   dvecOut - result, assigned by the macro
+ * NOTE(review): seq1 presumably picks the low byte of each lane - confirm.
+ */
+#define PACK_SCALE_SHIFT_S32_S8(inReg, inReg1, scale2, shift2, seq1, dvecOut)           { \
+    xb_vecN_2x64w m_wvec = IVP_MULUSN_2X16X32_0((xb_vecNx16U) (scale2), inReg);           \
+    xb_vecN_2x32v m_outL = IVP_PACKVRN_2X64W(m_wvec, shift2);                             \
+    m_outL = IVP_MAXN_2X32(IVP_MINN_2X32(m_outL, SCHAR_MAX), SCHAR_MIN);                  \
+    m_wvec = IVP_MULUSN_2X16X32_1((xb_vecNx16U) (scale2), inReg1);                        \
+    xb_vecN_2x32v m_outH = IVP_PACKVRN_2X64W(m_wvec, shift2);                             \
+    m_outH  = IVP_MAXN_2X32(IVP_MINN_2X32(m_outH, SCHAR_MAX), SCHAR_MIN);                 \
+    dvecOut = IVP_SEL2NX8(IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(m_outH)),           \
+                          IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(m_outL)), seq1);    \
+}
+/* ACC_INIT_BIAS: four variable-length bias loads from phvecBias (byte budget 4 * (numBias)); daccSum0..3 are all seeded with the same bias vectors. */
+#define ACC_INIT_BIAS(phvecBias, numBias, daccSum0, daccSum1, daccSum2, daccSum3)       {      \
+    xb_vecN_2x32v hvecBias1, hvecBias2;                                                        \
+    valign vaBias = IVP_LAN_2X32_PP(phvecBias);                                                \
+    IVP_LAVN_2X32_XP(hvecBias1, vaBias, phvecBias, 4 * (numBias));                             \
+    IVP_LAVN_2X32_XP(hvecBias2, vaBias, phvecBias, 4 * (numBias) - 2 * XCHAL_IVPN_SIMD_WIDTH); \
+    daccSum0 = IVP_CVT24UNX32L(hvecBias2, hvecBias1);                                          \
+    daccSum1 = IVP_CVT24UNX32L(hvecBias2, hvecBias1);                                          \
+    daccSum2 = IVP_CVT24UNX32L(hvecBias2, hvecBias1);                                          \
+    daccSum3 = IVP_CVT24UNX32L(hvecBias2, hvecBias1);                                          \
+    IVP_LAVN_2X32_XP(hvecBias1, vaBias, phvecBias, 4 * (numBias) - 4 * XCHAL_IVPN_SIMD_WIDTH); \
+    IVP_LAVN_2X32_XP(hvecBias2, vaBias, phvecBias, 4 * (numBias) - 6 * XCHAL_IVPN_SIMD_WIDTH); \
+    IVP_CVT24UNX32H(daccSum0, hvecBias2, hvecBias1);                                           \
+    IVP_CVT24UNX32H(daccSum1, hvecBias2, hvecBias1);                                           \
+    IVP_CVT24UNX32H(daccSum2, hvecBias2, hvecBias1);                                           \
+    IVP_CVT24UNX32H(daccSum3, hvecBias2, hvecBias1);                                           \
+}
+/* ACC_INIT_BIAS64_MOD_ONEACC: four variable-length bias loads from pdvecBias (byte budget (numBias) * 8, vaBias supplied by the caller), converted into accSum64. */
+#define ACC_INIT_BIAS64_MOD_ONEACC(pdvecBias, vaBias, numBias, accSum64)                {      \
+    xb_vec2Nx8 m_dvecBias1, m_dvecBias2, m_dvecBias3, m_dvecBias4;                             \
+    IVP_LAV2NX8_XP(m_dvecBias1, vaBias, pdvecBias, (numBias) * 8);                             \
+    IVP_LAV2NX8_XP(m_dvecBias2, vaBias, pdvecBias, (numBias) * 8 - 2 * XCHAL_IVPN_SIMD_WIDTH); \
+    IVP_LAV2NX8_XP(m_dvecBias3, vaBias, pdvecBias, (numBias) * 8 - 4 * XCHAL_IVPN_SIMD_WIDTH); \
+    IVP_LAV2NX8_XP(m_dvecBias4, vaBias, pdvecBias, (numBias) * 8 - 6 * XCHAL_IVPN_SIMD_WIDTH); \
+    accSum64 = IVP_CVT48UN_2X64L(m_dvecBias2, m_dvecBias1);                                    \
+    IVP_CVT48UN_2X64H(accSum64, m_dvecBias4, m_dvecBias3);                                     \
+}
+/* ACC_INIT_BIAS64_MOW_ONEACC: loads (flag) * 8 bias bytes via pBias64/vaBias, shuffles them with IVP_SHFLI_REP_0X4 and seeds wvecAcc (L then H convert). */
+#define ACC_INIT_BIAS64_MOW_ONEACC(pBias64, vaBias, wvecAcc, flag)                        \
+  {                                                                                       \
+    xb_vec2Nx8 m_dvecBias64; IVP_LAV2NX8_XP(m_dvecBias64, vaBias, pBias64, (flag) * 8);   \
+    m_dvecBias64 = IVP_SHFL2NX8I(m_dvecBias64, IVP_SHFLI_REP_0X4);                        \
+    wvecAcc      = IVP_CVT48UN_2X64L(m_dvecBias64, m_dvecBias64);                         \
+    IVP_CVT48UN_2X64H(wvecAcc, m_dvecBias64, m_dvecBias64);                               \
+  }
+/* VQ_INIT_OUTSCALE: loads up to 2 * (numOutScale) bytes of 16-bit scales from pOutScale and splits them into even-indexed (vecDataEven) and odd-indexed (vecDataOdd) elements. */
+#define VQ_INIT_OUTSCALE(pOutScale, numOutScale, vecDataEven, vecDataOdd)  {                     \
+    xb_vecNx16U vecDataL, vecDataH;                                                              \
+    valign vaScale = IVP_LANX16U_PP(pOutScale);                                                  \
+    IVP_LAVNX16_XP(vecDataL, vaScale, pOutScale, 2 * (numOutScale));                             \
+    IVP_LAVNX16_XP(vecDataH, vaScale, pOutScale, 2 * (numOutScale) - 2 * XCHAL_IVPN_SIMD_WIDTH); \
+    vecDataEven = IVP_SELNX16UI(vecDataH, vecDataL, IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);          \
+    vecDataOdd  = IVP_SELNX16UI(vecDataH, vecDataL, IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1);          \
+}
+#endif
diff --git a/backends/cadence/vision/third-party/libxai/include/xai_cnn_api.h b/backends/cadence/vision/third-party/libxai/include/xai_cnn_api.h
new file mode 100644
index 00000000000..07bd4dedbf3
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/include/xai_cnn_api.h
@@ -0,0 +1,7041 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#ifndef __XAI_CNN_API_H__
+#define __XAI_CNN_API_H__
+
+#include "xai_cnn_api_params.h"
+#include "xai_config_api.h"
+#include "xai_core_api.h"
+#include "xai_tile_manager.h"
+#include <math.h>
+#include <stdbool.h>
+
+
+#if ((XCHAL_VISION_TYPE >= 6))
+/***************************************************************************************************/
+/******************************  Fixed Point routines declaration  *********************************/
+/***************************************************************************************************/
+
+/* Convolution wrapper functions */
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D(const xai_pTile3D inTile,
+                                     const xai_pTile4D coeffTile,
+                                     const xai_pArray biasArray,
+                                     xai_pTile3D outTile,
+                                     xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE *xaiGetConvolve3DVariant(const xai_pTile3D inTile,
+                                                const xai_pTile4D coeffTile,
+                                                const xai_pArray biasArray,
+                                                xai_pTile3D outTile,
+                                                xai_cnn_conv_params *param);
+
+/* Convolution MOW*/
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxNj1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxNj1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxNj2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxNj2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxNj4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxNj4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            xai_cnn_conv_params *param);
+
+/* Convolution MOD */
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 xai_pTile3D outTile,
+                                                                 xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 xai_pTile3D outTile,
+                                                                 xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 xai_pTile3D outTile,
+                                                                 xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 xai_pTile3D outTile,
+                                                                 xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 xai_pTile3D outTile,
+                                                                 xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                             const xai_pTile4D coeffTile,
+                                                             const xai_pArray biasArray,
+                                                             xai_pTile3D outTile,
+                                                             xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                             const xai_pTile4D coeffTile,
+                                                             const xai_pArray biasArray,
+                                                             xai_pTile3D outTile,
+                                                             xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                             const xai_pTile4D coeffTile,
+                                                             const xai_pArray biasArray,
+                                                             xai_pTile3D outTile,
+                                                             xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                             const xai_pTile4D coeffTile,
+                                                             const xai_pArray biasArray,
+                                                             xai_pTile3D outTile,
+                                                             xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                             const xai_pTile4D coeffTile,
+                                                             const xai_pArray biasArray,
+                                                             xai_pTile3D outTile,
+                                                             xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                             const xai_pTile4D coeffTile,
+                                                             const xai_pArray biasArray,
+                                                             xai_pTile3D outTile,
+                                                             xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                             const xai_pTile4D coeffTile,
+                                                             const xai_pArray biasArray,
+                                                             xai_pTile3D outTile,
+                                                             xai_cnn_conv_params *param);
+
+/* Convolution SO */
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxN_S8S8IX_SO_DWH(const xai_pTile3D inTile,
+                                                         const xai_pTile4D coeffTile,
+                                                         const xai_pArray biasArray,
+                                                         xai_pTile3D outTile,
+                                                         xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxN_U8S8IX_SO_DWH(const xai_pTile3D inTile,
+                                                         const xai_pTile4D coeffTile,
+                                                         const xai_pArray biasArray,
+                                                         xai_pTile3D outTile,
+                                                         xai_cnn_conv_params *param);
+
+/* Convolution Fully connected */
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D(const xai_pTile3D inTile,
+                                           const xai_pTile4D coeffTile,
+                                           const xai_pArray biasArray,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D_S_S8S8IX(const xai_pTile3D inTile,
+                                                    const xai_pTile4D coeffTile,
+                                                    const xai_pArray biasArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D_S_U8S8IX(const xai_pTile3D inTile,
+                                                    const xai_pTile4D coeffTile,
+                                                    const xai_pArray biasArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D_S_S16S16I16(const xai_pTile3D inTile,
+                                                       const xai_pTile4D coeffTile,
+                                                       const xai_pArray biasArray,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnected3D(const xai_pTile3D inTile,
+                                                  const xai_pTile4D coeffTile,
+                                                  const xai_pArray biasArray,
+                                                  xai_pTile3D accTile, /* xai_pTile3D, so named accTile like the _S_* variants */
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnected3D_S_S8S8IXCa2(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D accTile,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnected3D_S_U8S8IXCa2(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D accTile,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnected3D_S_S16S16I16Ca2(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 xai_pTile3D accTile,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnected3D_S_S8S8IXCa2_QM32(const xai_pTile3D inTile,
+                                                                   const xai_pTile4D coeffTile,
+                                                                   const xai_pArray biasArray,
+                                                                   xai_pTile3D accTile,
+                                                                   xai_pTile3D outTile,
+                                                                   const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnected3D_S_U8S8IXCa2_QM32(const xai_pTile3D inTile,
+                                                                   const xai_pTile4D coeffTile,
+                                                                   const xai_pArray biasArray,
+                                                                   xai_pTile3D accTile,
+                                                                   xai_pTile3D outTile,
+                                                                   const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching(const xai_pTile4D inTile,
+                                                       const xai_pTile4D coeffTile,
+                                                       const xai_pArray biasArray,
+                                                       xai_pArray accArray,
+                                                       xai_pTile4D outTile,
+                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_S8S8IXCa2(const xai_pTile4D inTile,
+                                                                   const xai_pTile4D coeffTile,
+                                                                   const xai_pArray biasArray,
+                                                                   xai_pArray accArray,
+                                                                   xai_pTile4D outTile,
+                                                                   const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_U8S8IXCa2(const xai_pTile4D inTile,
+                                                                   const xai_pTile4D coeffTile,
+                                                                   const xai_pArray biasArray,
+                                                                   xai_pArray accArray,
+                                                                   xai_pTile4D outTile,
+                                                                   const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_S8U8IXCa2(const xai_pTile4D inTile,
+                                                                   const xai_pTile4D coeffTile,
+                                                                   const xai_pArray biasArray,
+                                                                   xai_pArray accArray,
+                                                                   xai_pTile4D outTile,
+                                                                   const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_U8U8IXCa2(const xai_pTile4D inTile,
+                                                                   const xai_pTile4D coeffTile,
+                                                                   const xai_pArray biasArray,
+                                                                   xai_pArray accArray,
+                                                                   xai_pTile4D outTile,
+                                                                   const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_U8S8IXCa2_NoBU(const xai_pTile4D inTile,
+                                                                        const xai_pTile4D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        xai_pArray accArray,
+                                                                        xai_pTile4D outTile,
+                                                                        const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_S16S16I16(const xai_pTile4D inTile,
+                                                                   const xai_pTile4D coeffTile,
+                                                                   const xai_pArray biasArray,
+                                                                   xai_pArray accArray,
+                                                                   xai_pTile4D outTile,
+                                                                   const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_U16S16I16(const xai_pTile4D inTile,
+                                                                   const xai_pTile4D coeffTile,
+                                                                   const xai_pArray biasArray,
+                                                                   xai_pArray accArray,
+                                                                   xai_pTile4D outTile,
+                                                                   const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_S16U16I16(const xai_pTile4D inTile,
+                                                                   const xai_pTile4D coeffTile,
+                                                                   const xai_pArray biasArray,
+                                                                   xai_pArray accArray,
+                                                                   xai_pTile4D outTile,
+                                                                   const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D2_S_S8(const xai_pTile3D inTile,
+                                                 const xai_pTile4D coeffTile,
+                                                 const xai_pArray biasArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D2_S_U8S8U8(const xai_pTile3D inTile,
+                                                     const xai_pTile4D coeffTile,
+                                                     const xai_pArray biasArray,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D2_S_U8(const xai_pTile3D inTile,
+                                                 const xai_pTile4D coeffTile,
+                                                 const xai_pArray biasArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_conv_params *param);
+
+/* Dilated Convolution wrapper function */
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D(const xai_pTile3D inTile,
+                                      const xai_pTile4D coeffTile,
+                                      const xai_pArray biasArray,
+                                      xai_pTile3D outTile,
+                                      const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE *xaiGetConvolved3DVariant(const xai_pTile3D inTile,
+                                                 const xai_pTile4D coeffTile,
+                                                 const xai_pArray biasArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_conv_params *param); /* NOTE(review): declared to return XAI_ERR_TYPE * (a pointer), whereas xaiConvolved3D above returns XAI_ERR_TYPE by value - confirm against the implementation */
+
+/* Dilated Convolution MOW, dilation = 1 */
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_2x2j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_2x2j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_4x4j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_4x4j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD(const xai_pTile3D inTile,
+                                                                  const xai_pTile4D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD(const xai_pTile3D inTile,
+                                                                  const xai_pTile4D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD(const xai_pTile3D inTile,
+                                                                  const xai_pTile4D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+/* Dilated Convolution MOW, dilation = 2 */
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+/* Dilated Convolution MOW, dilation = 4 */
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                               const xai_pTile4D coeffTile,
+                                                               const xai_pArray biasArray,
+                                                               xai_pTile3D outTile,
+                                                               const xai_cnn_conv_params *param);
+
+/* Dilated Convolution MOD*/
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                  const xai_pTile4D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                  const xai_pTile4D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                  const xai_pTile4D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                  const xai_pTile4D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                  const xai_pTile4D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                  const xai_pTile4D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                  const xai_pTile4D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_2x2_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_4x4_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+/* Partial convolution */
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                     const xai_pTile4D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D accTile,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                     const xai_pTile4D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D accTile,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32(const xai_pTile3D inTile,
+                                                                          const xai_pTile4D coeffTile,
+                                                                          const xai_pArray biasArray,
+                                                                          xai_pTile3D accTile,
+                                                                          xai_pTile3D outTile,
+                                                                          const xai_cnn_conv_params *param);
+
+/* MOD_DWH S16S16 Partial Convolution variant */
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                                     const xai_pTile4D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D accTile,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+/* Dilated Convolution SO*/
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IX_SO_DWH(const xai_pTile3D inTile,
+                                                          const xai_pTile4D coeffTile,
+                                                          const xai_pArray biasArray,
+                                                          xai_pTile3D outTile,
+                                                          const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_U8S8IX_SO_DWH(const xai_pTile3D inTile,
+                                                          const xai_pTile4D coeffTile,
+                                                          const xai_pArray biasArray,
+                                                          xai_pTile3D outTile,
+                                                          const xai_cnn_conv_params *param);
+
+/* Depthwise Convolution wrapper function */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D(const xai_pTile3D inTile,
+                                              const xai_pTile3D coeffTile,
+                                              const xai_pArray biasArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE *xaiGetDepthwiseConvolve2DVariant(const xai_pTile3D inTile,
+                                                         const xai_pTile3D coeffTile,
+                                                         const xai_pArray biasArray,
+                                                         xai_pTile3D outTile,
+                                                         const xai_cnn_conv_params *param);
+
+/* Depthwise Convolutions MOW */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+/* Depthwise Convolution MOW 16-bit Variants */
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj1_S16S16I16_MOW_WHD(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj2_S16S16I16_MOW_WHD(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj4_S16S16I16_MOW_WHD(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_conv_params *param);
+/* Depthwise MOW Convolution VQ variants */
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+/* j2 variants of the S8S8IX/U8S8IX VQ MOW prototypes: 3x3, 5x5, 7x7 and MxN (j<N> presumably encodes the stride - confirm). */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+/* j4 variants of the same set: 3x3, 5x5, 7x7 and MxN, S8S8IX and U8S8IX (j<N> presumably encodes the stride - confirm). */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+/* Depthwise VQ Convolution MOW 16-bit Variants */
+/* S16S16I16, MxN kernel only; the j1/j2/j4 suffixes mirror the S8S8IX/U8S8IX VQ MOW prototypes above. */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj1_S16S16I16_MOW_WHD(const xai_pTile3D inTile,
+                                                                          const xai_pTile3D coeffTile,
+                                                                          const xai_pArray biasArray,
+                                                                          const xai_pArray outputScaleArray,
+                                                                          xai_pTile3D outTile,
+                                                                          const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj2_S16S16I16_MOW_WHD(const xai_pTile3D inTile,
+                                                                          const xai_pTile3D coeffTile,
+                                                                          const xai_pArray biasArray,
+                                                                          const xai_pArray outputScaleArray,
+                                                                          xai_pTile3D outTile,
+                                                                          const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj4_S16S16I16_MOW_WHD(const xai_pTile3D inTile,
+                                                                          const xai_pTile3D coeffTile,
+                                                                          const xai_pArray biasArray,
+                                                                          const xai_pArray outputScaleArray,
+                                                                          xai_pTile3D outTile,
+                                                                          const xai_cnn_conv_params *param);
+
+/* Depthwise Convolutions MOD (S8S8IXCa2/U8S8IXCa2, DWH): no outputScaleArray argument. NOTE(review): no 7x7 U8S8IXCa2 prototype is declared - confirm intentional. */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_conv_params *param);
+/* Depthwise Convolution MOD 16-bit Variants */
+/* Same argument order as the S8/U8 MOD prototypes above; only the parameter names differ (p-prefixed here). */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D pinTile,
+                                                                      const xai_pTile3D pcoeffTile,
+                                                                      const xai_pArray pbiasArray,
+                                                                      xai_pTile3D poutTile,
+                                                                      const xai_cnn_conv_params *pconvParam);
+
+/* Depthwise MOD VQ Convolution variants (S8S8IXCa2/U8S8IXCa2, DWH) */
+/* Adds outputScaleArray after biasArray. NOTE(review): no 7x7 U8S8IXCa2 prototype is declared - confirm intentional. */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        const xai_pArray outputScaleArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        const xai_pArray outputScaleArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        const xai_pArray outputScaleArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        const xai_pArray outputScaleArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        const xai_pArray outputScaleArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        const xai_pArray outputScaleArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        const xai_pArray outputScaleArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_conv_params *param);
+/* Depthwise MOD VQ Convolution 16-bit Variants */
+/* Same argument order as the S8/U8 MOD VQ prototypes above; only the parameter names differ (p-prefixed here). */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D pinTile,
+                                                                        const xai_pTile3D pcoeffTile,
+                                                                        const xai_pArray pbiasArray,
+                                                                        const xai_pArray poutputScaleArray,
+                                                                        xai_pTile3D poutTile,
+                                                                        const xai_cnn_conv_params *pconvParam);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D(const xai_pTile3D inTile,
+                                                const xai_pTile3D coeffTile,
+                                                const xai_pArray biasArray,
+                                                const xai_pArray outputScaleArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_conv_params *param);
+/* Same argument list as xaiDepthwiseConvolveVQ2D. NOTE(review): returns XAI_ERR_TYPE* (a pointer), not an error code - confirm how callers are meant to use it. */
+_XAI_API_ XAI_ERR_TYPE *xaiGetDepthwiseConvolveVQ2DVariant(const xai_pTile3D inTile,
+                                                           const xai_pTile3D coeffTile,
+                                                           const xai_pArray biasArray,
+                                                           const xai_pArray outputScaleArray,
+                                                           xai_pTile3D outTile,
+                                                           const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D(const xai_pTile3D inTile,
+                                                 const xai_pTile3D coeffTile,
+                                                 const xai_pArray biasArray,
+                                                 const xai_pArray outputScaleArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_depthwiseDilatedConv_params *param);
+/* Same argument list as xaiDepthwiseConvolvedVQ2D. NOTE(review): returns XAI_ERR_TYPE* (a pointer), not an error code - confirm how callers are meant to use it. */
+_XAI_API_ XAI_ERR_TYPE *xaiGetDepthwiseConvolvedVQ2DVariant(const xai_pTile3D inTile,
+                                                            const xai_pTile3D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            const xai_pArray outputScaleArray,
+                                                            xai_pTile3D outTile,
+                                                            const xai_cnn_depthwiseDilatedConv_params *param);
+/* Dilated VQ MOW variants: MxN kernel, j1d4 suffix, S8S8IX and U8S8IX; param is xai_cnn_depthwiseDilatedConv_params. */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxNj1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                          const xai_pTile3D coeffTile,
+                                                                          const xai_pArray biasArray,
+                                                                          const xai_pArray outputScaleArray,
+                                                                          xai_pTile3D outTile,
+                                                                          const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxNj1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                          const xai_pTile3D coeffTile,
+                                                                          const xai_pArray biasArray,
+                                                                          const xai_pArray outputScaleArray,
+                                                                          xai_pTile3D outTile,
+                                                                          const xai_cnn_depthwiseDilatedConv_params *param);
+
+
+/* Depthwise dilated convolution wrapper function (no outputScaleArray argument) */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D(const xai_pTile3D inTile,
+                                               const xai_pTile3D coeffTile,
+                                               const xai_pArray biasArray,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_depthwiseDilatedConv_params *param);
+/* Dilated VQ MOW variants: MxN kernel, j1d2 suffix, S8S8IX and U8S8IX (these take outputScaleArray). */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxNj1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                          const xai_pTile3D coeffTile,
+                                                                          const xai_pArray biasArray,
+                                                                          const xai_pArray outputScaleArray,
+                                                                          xai_pTile3D outTile,
+                                                                          const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxNj1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                          const xai_pTile3D coeffTile,
+                                                                          const xai_pArray biasArray,
+                                                                          const xai_pArray outputScaleArray,
+                                                                          xai_pTile3D outTile,
+                                                                          const xai_cnn_depthwiseDilatedConv_params *param);
+/* Same argument list as xaiDepthwiseConvolved2D. NOTE(review): returns XAI_ERR_TYPE* (a pointer), not an error code - confirm how callers are meant to use it. */
+_XAI_API_ XAI_ERR_TYPE *xaiGetDepthwiseConvolved2DVariant(const xai_pTile3D inTile,
+                                                          const xai_pTile3D coeffTile,
+                                                          const xai_pArray biasArray,
+                                                          xai_pTile3D outTile,
+                                                          const xai_cnn_depthwiseDilatedConv_params *param);
+/* Dilated non-VQ MOW variants: MxN kernel, j1d4 suffix, S8S8IX and U8S8IX. */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxNj1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxNj1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_depthwiseDilatedConv_params *param);
+
+/*_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_3x3j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_3x3j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_depthwiseDilatedConv_params *param);
+ */
+/* Depthwise dilated MOW convolution variants (the fixed-size 3x3/5x5/7x7 j1d2 prototypes directly below are commented out) */
+/*_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_3x3j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_3x3j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_5x5j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_5x5j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_7x7j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_7x7j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_depthwiseDilatedConv_params *param);
+ */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxNj1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxNj1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_depthwiseDilatedConv_params *param);
+
+/*
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_5x5j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_5x5j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_7x7j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_7x7j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_depthwiseDilatedConv_params *param);
+
+ */
+/* Depthwise dilated MOD convolution variants (names end in _MOD_DWH). Each takes inTile, coeffTile, biasArray, outTile and a xai_cnn_depthwiseDilatedConv_params pointer, and returns XAI_ERR_TYPE. */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_3x3_S8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile3D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_5x5_S8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile3D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_7x7_S8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile3D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_depthwiseDilatedConv_params *param);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxN_S8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile3D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxN_U8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile3D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_5x5_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_3x3_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+/*
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_3x3j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_3x3j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_5x5j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_5x5j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_7x7j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_7x7j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_3x3j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_3x3j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_5x5j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_5x5j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_7x7j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+
+   _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_7x7j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                       const xai_pTile3D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_depthwiseDilatedConv_params *param);
+ */
+
+
+/* VQ variants: each prototype takes an outputScaleArray argument between biasArray and outTile. */
+/* Depthwise dilated MOD convolution variants (names end in _MOD_DWH); all return XAI_ERR_TYPE. */
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_3x3_S8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      const xai_pArray outputScaleArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_5x5_S8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      const xai_pArray outputScaleArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_7x7_S8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      const xai_pArray outputScaleArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxN_S8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      const xai_pArray outputScaleArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxN_U8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      const xai_pArray outputScaleArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                                         const xai_pTile3D coeffTile,
+                                                                         const xai_pArray biasArray,
+                                                                         const xai_pArray outputScaleArray,
+                                                                         xai_pTile3D outTile,
+                                                                         const xai_cnn_depthwiseDilatedConv_params *param);
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_3x3_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                                         const xai_pTile3D coeffTile,
+                                                                         const xai_pArray biasArray,
+                                                                         const xai_pArray outputScaleArray,
+                                                                         xai_pTile3D outTile,
+                                                                         const xai_cnn_depthwiseDilatedConv_params *param);
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_5x5_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                                         const xai_pTile3D coeffTile,
+                                                                         const xai_pArray biasArray,
+                                                                         const xai_pArray outputScaleArray,
+                                                                         xai_pTile3D outTile,
+                                                                         const xai_cnn_depthwiseDilatedConv_params *param);
+
+/* Depthwise DM MOD convolution (DM presumably = depth multiplier - TODO confirm): generic MxN prototypes with the S8S8IX, U8S8IX and S16S16I16 name suffixes. */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolved2D_S_MxN_S8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolved2D_S_MxN_U8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                      const xai_pTile3D coeffTile,
+                                                                      const xai_pArray biasArray,
+                                                                      xai_pTile3D outTile,
+                                                                      const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolved2D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                                         const xai_pTile3D coeffTile,
+                                                                         const xai_pArray biasArray,
+                                                                         xai_pTile3D outTile,
+                                                                         const xai_cnn_depthwiseDilatedConv_params *param);
+
+/* Depthwise DM MOD convolve, VQ forms: each takes an outputScaleArray argument between biasArray and outTile. */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolvedVQ2D_S_MxN_S8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        const xai_pArray outputScaleArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolvedVQ2D_S_MxN_U8S8IX_MOD_DWH(const xai_pTile3D inTile,
+                                                                        const xai_pTile3D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        const xai_pArray outputScaleArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolvedVQ2D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                                           const xai_pTile3D coeffTile,
+                                                                           const xai_pArray biasArray,
+                                                                           const xai_pArray outputScaleArray,
+                                                                           xai_pTile3D outTile,
+                                                                           const xai_cnn_depthwiseDilatedConv_params *param);
+
+/*_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolvedReorderCoeff2D_MOD(const xai_pTile3D srcTile, const xai_pArray biasArray,
+                                                                xai_pTile3D dstTile, xai_pArray biasArrayReOrder,
+                                                                const int32_t inDepth, const int32_t depthMultiplier);
+ */
+
+/* VQ variants */
+
+/* Dilated convolution wrapper (VQ): xaiConvolvedVQ3D and xaiGetConvolvedVQ3DVariant take the same argument list; the 3x3j1d1 MOW prototypes that follow take it as well. */
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D(const xai_pTile3D inTile,
+                                        const xai_pTile4D coeffTile,
+                                        const xai_pArray biasArray,
+                                        const xai_pArray outputScaleArray,
+                                        xai_pTile3D outTile,
+                                        const xai_cnn_conv_params *param);
+/* NOTE(review): the next prototype returns XAI_ERR_TYPE * (a pointer), unlike its siblings which return XAI_ERR_TYPE - confirm against the implementation. */
+_XAI_API_ XAI_ERR_TYPE *xaiGetConvolvedVQ3DVariant(const xai_pTile3D inTile,
+                                                   const xai_pTile4D coeffTile,
+                                                   const xai_pArray biasArray,
+                                                   const xai_pArray outputScaleArray,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+/* Dilated MOD VQ convolution variants: fixed 1x1, 2x2, 3x3, 4x4, 5x5 and 7x7 kernel names plus a generic MxN form, all with the S8S8IXCa2_MOD_WHD_DWH suffix. */
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile4D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    const xai_pArray outputScaleArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_conv_params *param);
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile4D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    const xai_pArray outputScaleArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile4D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    const xai_pArray outputScaleArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile4D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    const xai_pArray outputScaleArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile4D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    const xai_pArray outputScaleArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile4D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    const xai_pArray outputScaleArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                                    const xai_pTile4D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    const xai_pArray outputScaleArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_conv_params *param);
+
+/* Dilated convolution MOW variants, dilation = 1 (the "d1" in the j1d1 names presumably encodes this - TODO confirm). */
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_2x2j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_2x2j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_4x4j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_4x4j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD(const xai_pTile3D inTile,
+                                                                    const xai_pTile4D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    const xai_pArray outputScaleArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD(const xai_pTile3D inTile,
+                                                                    const xai_pTile4D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    const xai_pArray outputScaleArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD(const xai_pTile3D inTile,
+                                                                    const xai_pTile4D coeffTile,
+                                                                    const xai_pArray biasArray,
+                                                                    const xai_pArray outputScaleArray,
+                                                                    xai_pTile3D outTile,
+                                                                    const xai_cnn_conv_params *param);
+
+/* Dilated Convolutions MOW, dilation = 2 */
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+/* Dilated Convolutions MOW, dilation = 4 */
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile,
+                                                                 const xai_pTile4D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 const xai_pArray outputScaleArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_conv_params *param);
+
+/* Dilated Convolution MOD_DWH - VQ variants */
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile,
+                                                                          const xai_pTile4D coeffTile,
+                                                                          const xai_pArray biasArray,
+                                                                          const xai_pArray outputScaleArray,
+                                                                          xai_pTile3D outTile,
+                                                                          const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile,
+                                                                        const xai_pTile4D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                              const xai_pTile4D coeffTile,
+                                                              const xai_pArray biasArray,
+                                                              xai_pTile3D outTile,
+                                                              const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile,
+                                                                          const xai_pTile4D coeffTile,
+                                                                          const xai_pArray biasArray,
+                                                                          const xai_pArray outputScaleArray,
+                                                                          xai_pTile3D outTile,
+                                                                          const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile,
+                                                                        const xai_pTile4D coeffTile,
+                                                                        const xai_pArray biasArray,
+                                                                        xai_pTile3D outTile,
+                                                                        const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+/* Bias update function: takes a 4D coefficient tile and a bias array, returns XAI_ERR_TYPE. NOTE(review): biasArray is presumably updated in place -- confirm. */
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedBiasUpdate_S8S32(const xai_pTile4D coeffTile,
+                                                    xai_pArray biasArray);
+
+/* Reorder a 4D tile to IN64DWH format; going by the parameter names, coeffTileIn is the source tile and coeffTileOut receives the result. */
+_XAI_API_ XAI_ERR_TYPE xaiReOrder4DToIN64DWH_I8(xai_pTile4D coeffTileIn, xai_pTile4D coeffTileOut);
+
+/* Reorder a 4D tile to IN32DWH format; going by the parameter names, coeffTileIn is the source tile and coeffTileOut receives the result. */
+_XAI_API_ XAI_ERR_TYPE xaiReOrder4DToIN32DWH_I16(xai_pTile4D coeffTileIn, xai_pTile4D coeffTileOut);
+
+/* Partial convolution variants (each takes an additional accTile argument before outTile) */
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                       const xai_pTile4D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D accTile,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile,
+                                                                                 const xai_pTile4D coeffTile,
+                                                                                 const xai_pArray biasArray,
+                                                                                 const xai_pArray outputScaleArray,
+                                                                                 xai_pTile3D accTile,
+                                                                                 xai_pTile3D outTile,
+                                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile,
+                                                                               const xai_pTile4D coeffTile,
+                                                                               const xai_pArray biasArray,
+                                                                               xai_pTile3D accTile,
+                                                                               xai_pTile3D outTile,
+                                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile,
+                                                                       const xai_pTile4D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D accTile,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile,
+                                                                                 const xai_pTile4D coeffTile,
+                                                                                 const xai_pArray biasArray,
+                                                                                 const xai_pArray outputScaleArray,
+                                                                                 xai_pTile3D accTile,
+                                                                                 xai_pTile3D outTile,
+                                                                                 const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile,
+                                                                               const xai_pTile4D coeffTile,
+                                                                               const xai_pArray biasArray,
+                                                                               xai_pTile3D accTile,
+                                                                               xai_pTile3D outTile,
+                                                                               const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32(const xai_pTile3D inTile,
+                                                                            const xai_pTile4D coeffTile,
+                                                                            const xai_pArray biasArray,
+                                                                            const xai_pArray outputScaleArray,
+                                                                            xai_pTile3D accTile,
+                                                                            xai_pTile3D outTile,
+                                                                            const xai_cnn_conv_params *param);
+
+/* MOD_DWH S16S16 Partial Convolution VQ variant: takes outputScaleArray and accTile in addition to the in/coeff/bias/out arguments; returns XAI_ERR_TYPE. */
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile,
+                                                                       const xai_pTile4D coeffTile,
+                                                                       const xai_pArray biasArray,
+                                                                       const xai_pArray outputScaleArray,
+                                                                       xai_pTile3D accTile,
+                                                                       xai_pTile3D outTile,
+                                                                       const xai_cnn_conv_params *param);
+
+/* MxN SO VQ variants */
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IX_SO_DWH(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            const xai_pArray outputScaleArray,
+                                                            xai_pTile3D outTile,
+                                                            const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_U8S8IX_SO_DWH(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            const xai_pArray outputScaleArray,
+                                                            xai_pTile3D outTile,
+                                                            const xai_cnn_conv_params *param);
+
+
+/* MxN Fully Connected VQ variants (plain, Partial, and WithBatching forms) */
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3D(const xai_pTile3D inTile,
+                                             const xai_pTile4D coeffTile,
+                                             const xai_pArray biasArray,
+                                             const xai_pArray outputScaleArray,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3D_S_S8S8IX(const xai_pTile3D inTile,
+                                                      const xai_pTile4D coeffTile,
+                                                      const xai_pArray biasArray,
+                                                      const xai_pArray outputScaleArray,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3D_S_U8S8IX(const xai_pTile3D inTile,
+                                                      const xai_pTile4D coeffTile,
+                                                      const xai_pArray biasArray,
+                                                      const xai_pArray outputScaleArray,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3D_S_S16S16I16(const xai_pTile3D inTile,
+                                                         const xai_pTile4D coeffTile,
+                                                         const xai_pArray biasArray,
+                                                         const xai_pArray outputScaleArray,
+                                                         xai_pTile3D outTile,
+                                                         const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnectedVQ3D(const xai_pTile3D inTile,
+                                                    const xai_pTile4D coeffTile,
+                                                    const xai_pArray biasArray,
+                                                    const xai_pArray outputScaleArray,
+                                                    xai_pTile3D accTile,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnectedVQ3D_S_S8S8IXCa2(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D accTile,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnectedVQ3D_S_U8S8IXCa2(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                const xai_pArray outputScaleArray,
+                                                                xai_pTile3D accTile,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnectedVQ3D_S_S16S16I16Ca2(const xai_pTile3D inTile,
+                                                                   const xai_pTile4D coeffTile,
+                                                                   const xai_pArray biasArray,
+                                                                   const xai_pArray outputScaleArray,
+                                                                   xai_pTile3D accTile,
+                                                                   xai_pTile3D outTile,
+                                                                   const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnectedVQ3D_S_S8S8IXCa2_QM32(const xai_pTile3D inTile,
+                                                                     const xai_pTile4D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     const xai_pArray outputScaleArray,
+                                                                     xai_pTile3D accTile,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnectedVQ3D_S_U8S8IXCa2_QM32(const xai_pTile3D inTile,
+                                                                     const xai_pTile4D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     const xai_pArray outputScaleArray,
+                                                                     xai_pTile3D accTile,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching(const xai_pTile4D inTile,
+                                                         const xai_pTile4D coeffTile,
+                                                         const xai_pArray biasArray,
+                                                         xai_pArray accArray,
+                                                         const xai_pArray outputScaleArray,
+                                                         xai_pTile4D outTile,
+                                                         const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_S8S8IXCa2(const xai_pTile4D inTile,
+                                                                     const xai_pTile4D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pArray accArray,
+                                                                     const xai_pArray outputScaleArray,
+                                                                     xai_pTile4D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_U8S8IXCa2(const xai_pTile4D inTile,
+                                                                     const xai_pTile4D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pArray accArray,
+                                                                     const xai_pArray outputScaleArray,
+                                                                     xai_pTile4D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_S8U8IXCa2(const xai_pTile4D inTile,
+                                                                     const xai_pTile4D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pArray accArray,
+                                                                     const xai_pArray outputScaleArray,
+                                                                     xai_pTile4D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_U8U8IXCa2(const xai_pTile4D inTile,
+                                                                     const xai_pTile4D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pArray accArray,
+                                                                     const xai_pArray outputScaleArray,
+                                                                     xai_pTile4D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_U8S8IXCa2_NoBU(const xai_pTile4D inTile,
+                                                                          const xai_pTile4D coeffTile,
+                                                                          const xai_pArray biasArray,
+                                                                          xai_pArray accArray,
+                                                                          const xai_pArray outputScaleArray,
+                                                                          xai_pTile4D outTile,
+                                                                          const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_S16S16I16(const xai_pTile4D inTile,
+                                                                     const xai_pTile4D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pArray accArray,
+                                                                     const xai_pArray outputScaleArray,
+                                                                     xai_pTile4D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_U16S16I16(const xai_pTile4D inTile,
+                                                                     const xai_pTile4D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pArray accArray,
+                                                                     const xai_pArray outputScaleArray,
+                                                                     xai_pTile4D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_S16U16I16(const xai_pTile4D inTile,
+                                                                     const xai_pTile4D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pArray accArray,
+                                                                     const xai_pArray outputScaleArray,
+                                                                     xai_pTile4D outTile,
+                                                                     const xai_cnn_conv_params *param);
+
+/* Max Pool */
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D(const xai_pTile3D inTile,
+                                    xai_pTile3D outTile,
+                                    const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxNj1_S8_WHD(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxNj1_U8_WHD(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxNj1_S16_WHD(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxNj2_S8_WHD(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxNj2_U8_WHD(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxNj2_S16_WHD(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_S8_WHD(const xai_pTile3D inTile,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_U8_WHD(const xai_pTile3D inTile,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_S16_WHD(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_S8_DWH(const xai_pTile3D inTile,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_U8_DWH(const xai_pTile3D inTile,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_S16_DWH(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_pooling_params *param);
+
+/* MaxPoolWithIdx Variants (each takes an additional idxTile argument after outTile) */
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           xai_pTile3D idxTile,
+                                           const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxNj1_S8_WHD(const xai_pTile3D inTile,
+                                                        xai_pTile3D outTile,
+                                                        xai_pTile3D idxTile,
+                                                        const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxNj2_S8_WHD(const xai_pTile3D inTile,
+                                                        xai_pTile3D outTile,
+                                                        xai_pTile3D idxTile,
+                                                        const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxN_S8_DWH(const xai_pTile3D inTile,
+                                                      xai_pTile3D outTile,
+                                                      xai_pTile3D idxTile, /* idxTile follows outTile in all WithIdx prototypes below; xaiMaxUnPool3D_* take idxTile BEFORE outTile. Presumably receives the max positions - TODO confirm. */
+                                                      const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxNj1_U8_WHD(const xai_pTile3D inTile,
+                                                        xai_pTile3D outTile,
+                                                        xai_pTile3D idxTile,
+                                                        const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxNj2_U8_WHD(const xai_pTile3D inTile,
+                                                        xai_pTile3D outTile,
+                                                        xai_pTile3D idxTile,
+                                                        const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxN_U8_DWH(const xai_pTile3D inTile,
+                                                      xai_pTile3D outTile,
+                                                      xai_pTile3D idxTile,
+                                                      const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxNj1_S16_WHD(const xai_pTile3D inTile,
+                                                         xai_pTile3D outTile,
+                                                         xai_pTile3D idxTile,
+                                                         const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxNj2_S16_WHD(const xai_pTile3D inTile,
+                                                         xai_pTile3D outTile,
+                                                         xai_pTile3D idxTile,
+                                                         const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxN_S16_DWH(const xai_pTile3D inTile,
+                                                       xai_pTile3D outTile,
+                                                       xai_pTile3D idxTile,
+                                                       const xai_cnn_pooling_params *param);
+
+
+/* MaxUnPool variants. Argument order is (inTile, idxTile, outTile, param):     */
+/* idxTile comes BEFORE outTile here, the reverse of xaiMaxPoolWithIdx3D_*.     */
+/* The MxNj1 S8/U8/S16 WHD prototypes below are commented out, i.e. undeclared. */
+_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D(const xai_pTile3D inTile,
+                                      const xai_pTile3D idxTile,
+                                      xai_pTile3D outTile,
+                                      const xai_cnn_pooling_params *param);
+/* Disabled prototype -- xaiMaxUnPool3D_MxNj1_S8_WHD is not declared by this header:
+   _XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj1_S8_WHD(const xai_pTile3D inTile,
+                                                const xai_pTile3D idxTile,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_pooling_params *param);
+ */
+_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj2_S8_WHD(const xai_pTile3D inTile,
+                                                   const xai_pTile3D idxTile,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxN_S8_DWH(const xai_pTile3D inTile,
+                                                 const xai_pTile3D idxTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+/* Disabled prototype -- xaiMaxUnPool3D_MxNj1_U8_WHD is not declared by this header:
+   _XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj1_U8_WHD(const xai_pTile3D inTile,
+                                                const xai_pTile3D idxTile,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_pooling_params *param);
+ */
+_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj2_U8_WHD(const xai_pTile3D inTile,
+                                                   const xai_pTile3D idxTile,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxN_U8_DWH(const xai_pTile3D inTile,
+                                                 const xai_pTile3D idxTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+/* Disabled prototype -- xaiMaxUnPool3D_MxNj1_S16_WHD is not declared by this header:
+   _XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj1_S16_WHD(const xai_pTile3D inTile,
+                                                 const xai_pTile3D idxTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+ */
+_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj2_S16_WHD(const xai_pTile3D inTile,
+                                                    const xai_pTile3D idxTile,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj2_F16_WHD(const xai_pTile3D inTile,
+                                                    const xai_pTile3D idxTile,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxN_S16_DWH(const xai_pTile3D inTile,
+                                                  const xai_pTile3D idxTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxN_F16_DWH(const xai_pTile3D inTile,
+                                                  const xai_pTile3D idxTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_pooling_params *param);
+/* RoI Max Pool variants: take an xai_pArray RoIParam and xai_cnn_roi_pooling_params; outTile (and idxTile for the WithIdx forms) are xai_pTile4D, not 3D. Only DWH type-specific variants are declared. */
+_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPool3D(const xai_pTile3D inTile,
+                                       const xai_pArray RoIParam,
+                                       xai_pTile4D outTile,
+                                       const xai_cnn_roi_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPool3D_U8_DWH(const xai_pTile3D inTile,
+                                              const xai_pArray RoIParam,
+                                              xai_pTile4D outTile,
+                                              const xai_cnn_roi_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPool3D_S8_DWH(const xai_pTile3D inTile,
+                                              const xai_pArray RoIParam,
+                                              xai_pTile4D outTile,
+                                              const xai_cnn_roi_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPool3D_S16_DWH(const xai_pTile3D inTile,
+                                               const xai_pArray RoIParam,
+                                               xai_pTile4D outTile,
+                                               const xai_cnn_roi_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPoolWithIdx3D(const xai_pTile3D inTile,
+                                              const xai_pArray RoIParam,
+                                              xai_pTile4D outTile,
+                                              xai_pTile4D idxTile,
+                                              const xai_cnn_roi_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPoolWithIdx3D_U8_DWH(const xai_pTile3D inTile,
+                                                     const xai_pArray RoIParam,
+                                                     xai_pTile4D outTile,
+                                                     xai_pTile4D idxTile,
+                                                     const xai_cnn_roi_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPoolWithIdx3D_S8_DWH(const xai_pTile3D inTile,
+                                                     const xai_pArray RoIParam,
+                                                     xai_pTile4D outTile,
+                                                     xai_pTile4D idxTile,
+                                                     const xai_cnn_roi_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPoolWithIdx3D_S16_DWH(const xai_pTile3D inTile,
+                                                      const xai_pArray RoIParam,
+                                                      xai_pTile4D outTile,
+                                                      xai_pTile4D idxTile,
+                                                      const xai_cnn_roi_pooling_params *param);
+
+/* Average Pool: only xaiAvgPool3D and the MxNj1 variants take a bufArray argument; the MxN, MxNj2 and DWH variants take (inTile, outTile, param) only. */
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D(const xai_pTile3D inTile,
+                                    xai_pArray bufArray,
+                                    xai_pTile3D outTile,
+                                    const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_S8_WHD(const xai_pTile3D inTile,
+                                                 xai_pArray bufArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_U8_WHD(const xai_pTile3D inTile,
+                                                 xai_pArray bufArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_S16_WHD(const xai_pTile3D inTile,
+                                                  xai_pArray bufArray,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_pooling_params *param);
+
+/* MxNj1 variants with two type tags (e.g. S8U8) -- presumably input type then output type; TODO confirm. */
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_S8U8_WHD(const xai_pTile3D inTile,
+                                                   xai_pArray bufArray,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_S8S16_WHD(const xai_pTile3D inTile,
+                                                    xai_pArray bufArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_U8S8_WHD(const xai_pTile3D inTile,
+                                                   xai_pArray bufArray,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_U8S16_WHD(const xai_pTile3D inTile,
+                                                    xai_pArray bufArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S8_WHD(const xai_pTile3D inTile,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_U8_WHD(const xai_pTile3D inTile,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S16_WHD(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_pooling_params *param);
+
+/* MxN WHD variants with two type tags; no bufArray argument. */
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S8U8_WHD(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S8S16_WHD(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_U8S8_WHD(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_U8S16_WHD(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_S8_WHD(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_U8_WHD(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_S16_WHD(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_S8U8_WHD(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_S8S16_WHD(const xai_pTile3D inTile,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_U8S8_WHD(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_U8S16_WHD(const xai_pTile3D inTile,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S8_DWH(const xai_pTile3D inTile,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_U8_DWH(const xai_pTile3D inTile,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S16_DWH(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S8U8_DWH(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S8S16_DWH(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_U8S16_DWH(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_U8S8_DWH(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param);
+
+/* Global Average Pool: every variant takes (inTile, bufferArray, outTile, param) with xai_cnn_global_pooling_params; note the array is named bufferArray here but bufArray in the xaiAvgPool3D prototypes. */
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D(const xai_pTile3D inTile,
+                                          xai_pArray bufferArray,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S8_WHD(const xai_pTile3D inTile,
+                                                 xai_pArray bufferArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_U8_WHD(const xai_pTile3D inTile,
+                                                 xai_pArray bufferArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S8U8_WHD(const xai_pTile3D inTile,
+                                                   xai_pArray bufferArray,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S8S16_WHD(const xai_pTile3D inTile,
+                                                    xai_pArray bufferArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_U8S8_WHD(const xai_pTile3D inTile,
+                                                   xai_pArray bufferArray,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_U8S16_WHD(const xai_pTile3D inTile,
+                                                    xai_pArray bufferArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S16_WHD(const xai_pTile3D inTile,
+                                                  xai_pArray bufferArray,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S8_DWH(const xai_pTile3D inTile,
+                                                 xai_pArray bufferArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_U8_DWH(const xai_pTile3D inTile,
+                                                 xai_pArray bufferArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S8U8_DWH(const xai_pTile3D inTile,
+                                                   xai_pArray bufferArray,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S8S16_DWH(const xai_pTile3D inTile,
+                                                    xai_pArray bufferArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_U8S8_DWH(const xai_pTile3D inTile,
+                                                   xai_pArray bufferArray,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_U8S16_DWH(const xai_pTile3D inTile,
+                                                    xai_pArray bufferArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_global_pooling_params* param);
+
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S16_DWH(const xai_pTile3D inTile,
+                                                  xai_pArray bufferArray,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_global_pooling_params* param);
+
+/* Average pooling CNNA variants: same xai_cnn_pooling_params as xaiAvgPool3D plus a trailing xai_size3D frame3DSize argument. */
+/* Only the xaiAvgPoolA3D entry takes bufArray; the type-specific prototypes in this group are all MxN DWH and omit it. */
+_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D(const xai_pTile3D inTile,
+                                     xai_pArray bufArray,
+                                     xai_pTile3D outTile,
+                                     const xai_cnn_pooling_params *param,
+                                     const xai_size3D frame3DSize);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_U8_DWH(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_pooling_params *param,
+                                                const xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_S8_DWH(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_pooling_params *param,
+                                                const xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_S16_DWH(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param,
+                                                 const xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_U8S8_DWH(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_pooling_params *param,
+                                                  const xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_U8S16_DWH(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_pooling_params *param,
+                                                   const xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_S8U8_DWH(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_pooling_params *param,
+                                                  const xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_S8S16_DWH(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_pooling_params *param,
+                                                   const xai_size3D frame3DSize);
+
+/* Adaptive Average Pool: no params struct; the prototypes take (inTile, inTileIndexArray, outTile). */
+_XAI_API_ XAI_ERR_TYPE xaiAdaptiveAvgPool3D_S8_DWH(const xai_pTile3D inTile,
+                                                   const xai_pArray inTileIndexArray,
+                                                   xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiAdaptiveAvgPool3D_IX(const xai_pTile3D inTile,
+                                               const xai_pArray inTileIndexArray,
+                                               xai_pTile3D outTile);
+
+/* Adaptive Max Pool: no params struct; the prototypes take (inTile, inTileIndexArray, outTile). */
+_XAI_API_ XAI_ERR_TYPE xaiAdaptiveMaxPool3D_S8_DWH(const xai_pTile3D inTile,
+                                                   const xai_pArray inTileIndexArray,
+                                                   xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiAdaptiveMaxPool3D_IX(const xai_pTile3D inTile,
+                                               const xai_pArray inTileIndexArray,
+                                               xai_pTile3D outTile);
+
+/* LRN */
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D(const xai_pTile3D inTile,
+                                     const xai_pArray lutArray,
+                                     xai_pTile3D outTile,
+                                     const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D(const xai_pTile3D inTile,
+                                       const xai_pArray lutArray,
+                                       xai_pTile3D outTile,
+                                       const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_3_U8S8_WHD(const xai_pTile3D inTile,
+                                                  const xai_pArray lutArray,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_5_U8S8_WHD(const xai_pTile3D inTile,
+                                                  const xai_pArray lutArray,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_N_U8S8_WHD(const xai_pTile3D inTile,
+                                                  const xai_pArray lutArray,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_3_U8S8_DWH(const xai_pTile3D inTile,
+                                                  const xai_pArray lutArray,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_5_U8S8_DWH(const xai_pTile3D inTile,
+                                                  const xai_pArray lutArray,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_N_U8S8_DWH(const xai_pTile3D inTile,
+                                                  const xai_pArray lutArray,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_3x3_U8S8_WHD(const xai_pTile3D inTile,
+                                                      const xai_pArray lutArray,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_5x5_U8S8_WHD(const xai_pTile3D inTile,
+                                                      const xai_pArray lutArray,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_MxN_U8S8_WHD(const xai_pTile3D inTile,
+                                                      const xai_pArray lutArray,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_3x3_U8S8_DWH(const xai_pTile3D inTile,
+                                                      const xai_pArray lutArray,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_5x5_U8S8_DWH(const xai_pTile3D inTile,
+                                                      const xai_pArray lutArray,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_MxN_U8S8_DWH(const xai_pTile3D inTile,
+                                                      const xai_pArray lutArray,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_3_U8_WHD(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_5_U8_WHD(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_N_U8_WHD(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_3_U8_DWH(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_5_U8_DWH(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_N_U8_DWH(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_3x3_U8_WHD(const xai_pTile3D inTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_5x5_U8_WHD(const xai_pTile3D inTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_MxN_U8_WHD(const xai_pTile3D inTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_3x3_U8_DWH(const xai_pTile3D inTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_5x5_U8_DWH(const xai_pTile3D inTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_MxN_U8_DWH(const xai_pTile3D inTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_lrn_spatial_params *param);
+/* Depth LRN prototypes: same (inTile, lutArray, outTile) arguments, but take xai_cnn_lrn_depth_params. */
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_3_S8_WHD(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_5_S8_WHD(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_N_S8_WHD(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_3_S8_DWH(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_5_S8_DWH(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_lrn_depth_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_N_S8_DWH(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_lrn_depth_params *param);
+/* Spatial LRN prototypes with S8-suffixed names; these take xai_cnn_lrn_spatial_params. */
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_3x3_S8_WHD(const xai_pTile3D inTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_5x5_S8_WHD(const xai_pTile3D inTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_MxN_S8_WHD(const xai_pTile3D inTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_3x3_S8_DWH(const xai_pTile3D inTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_5x5_S8_DWH(const xai_pTile3D inTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_lrn_spatial_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_MxN_S8_DWH(const xai_pTile3D inTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_lrn_spatial_params *param);
+
+/* LUT APIs: each takes (inTile, lutArray, outTile, const xai_cnn_lut_params *params) and returns XAI_ERR_TYPE. */
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D(const xai_pTile3D inTile,
+                                const xai_pArray lutArray,
+                                xai_pTile3D outTile,
+                                const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Oddsym_S8S8(const xai_pTile3D inTile,
+                                            const xai_pArray lutArray,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Evensym_S8I8(const xai_pTile3D inTile,
+                                             const xai_pArray lutArray,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Normal_S8I8(const xai_pTile3D inTile,
+                                            const xai_pArray lutArray,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Oddsym_S8S16(const xai_pTile3D inTile,
+                                             const xai_pArray lutArray,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Evensym_S8I16(const xai_pTile3D inTile,
+                                              const xai_pArray lutArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Normal_S8I16(const xai_pTile3D inTile,
+                                             const xai_pArray lutArray,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Oddsym_S16S8(const xai_pTile3D inTile,
+                                             const xai_pArray lutArray,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Evensym_S16I8(const xai_pTile3D inTile,
+                                              const xai_pArray lutArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Normal_S16I8(const xai_pTile3D inTile,
+                                             const xai_pArray lutArray,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Oddsym_S16S16(const xai_pTile3D inTile,
+                                              const xai_pArray lutArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Evensym_S16I16(const xai_pTile3D inTile,
+                                               const xai_pArray lutArray,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Normal_S16I16(const xai_pTile3D inTile,
+                                              const xai_pArray lutArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_lut_params *params);
+
+/* Partial Dual LUT APIs: take two LUT arrays (lut1Array, lut2Array) plus xai_cnn_lut_params. */
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialDualLUT3D_S16I16(const xai_pTile3D inTile,
+                                                  const xai_pArray lut1Array,
+                                                  const xai_pArray lut2Array,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialDualLUT3D_Oddsym_S16S16(const xai_pTile3D inTile,
+                                                         const xai_pArray lut1Array,
+                                                         const xai_pArray lut2Array,
+                                                         xai_pTile3D outTile,
+                                                         const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialDualLUT3D_Evensym_S16I16(const xai_pTile3D inTile,
+                                                          const xai_pArray lut1Array,
+                                                          const xai_pArray lut2Array,
+                                                          xai_pTile3D outTile,
+                                                          const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialDualLUT3D_Normal_S16I16(const xai_pTile3D inTile,
+                                                         const xai_pArray lut1Array,
+                                                         const xai_pArray lut2Array,
+                                                         xai_pTile3D outTile,
+                                                         const xai_cnn_lut_params *params);
+
+/* FillTile -- NOTE(review): fillEdgeExtension presumably selects whether the edge extension is filled too; confirm. */
+_XAI_API_ XAI_ERR_TYPE xaiFillTile3D(xai_pTile3D dstTile,
+                                     const int32_t value,
+                                     xai_bool fillEdgeExtension);
+
+_XAI_API_ XAI_ERR_TYPE xaiFillTile3D_I8(xai_pTile3D dstTile,
+                                        const int32_t value,
+                                        xai_bool fillEdgeExtension);
+
+_XAI_API_ XAI_ERR_TYPE xaiFillTile3D_I16(xai_pTile3D dstTile,
+                                         const int32_t value,
+                                         xai_bool fillEdgeExtension);
+
+/* Extend Edge: *Const3D* variants take a scalar int32_t value, the others an xai_pArray; all take frame3DSize. */
+_XAI_API_ XAI_ERR_TYPE xaiExtendEdgesConst3D(xai_pTile3D dstTile,
+                                             const int32_t value,
+                                             xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiExtendEdgesConst3D_I8(xai_pTile3D dstTile,
+                                                const int32_t value,
+                                                xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiExtendEdgesConst3D_I16(xai_pTile3D dstTile,
+                                                 const int32_t value,
+                                                 xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiExtendEdges3D(xai_pTile3D dstTile,
+                                        const xai_pArray pArray,
+                                        xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiExtendEdges3D_I8(xai_pTile3D dstTile,
+                                           const xai_pArray pArray,
+                                           xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiExtendEdges3D_I16(xai_pTile3D dstTile,
+                                            const xai_pArray pArray,
+                                            xai_size3D frame3DSize);
+
+/* Copy Tile -- NOTE(review): copy_edge_extension presumably selects whether edge data is copied too; confirm. */
+_XAI_API_ XAI_ERR_TYPE xaiCopyTile3D(const xai_pTile3D inTile,
+                                     xai_pTile3D outTile,
+                                     xai_bool copy_edge_extension);
+
+/* Transpose: xaiTranspose3D* take (inTile, outTile); xaiTranspose3D2* additionally take a bufArray argument. */
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D(const xai_pTile3D inTile,
+                                      xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I8_WHD_DWH(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I8_WHD_DWH_Depth3(const xai_pTile3D inTile,
+                                                        xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I8_DWH_WHD(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I8_DWH_WHD_Depth3(const xai_pTile3D inTile,
+                                                        xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I16_WHD_DWH(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I16_DWH_WHD(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I32_WHD_DWH(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I32_DWH_WHD(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile);
+/* Disabled (commented-out) prototype, not declared by this header:
+   _XAI_API_ XAI_ERR_TYPE xaiTranspose_I32(const xai_pArray srcArray,
+                                     xai_pArray dstArray);
+ */
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2(const xai_pTile3D inTile,
+                                       xai_pArray bufArray,
+                                       xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2_I8_DWH_WHD(const xai_pTile3D inTile,
+                                                  xai_pArray bufArray,
+                                                  xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2_I16_DWH_WHD(const xai_pTile3D inTile,
+                                                   xai_pArray bufArray,
+                                                   xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2_I32_DWH_WHD(const xai_pTile3D inTile,
+                                                   xai_pArray bufArray,
+                                                   xai_pTile3D outTile);
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2_I8_WHD_DWH(const xai_pTile3D inTile,
+                                                  xai_pArray bufArray,
+                                                  xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2_I16_WHD_DWH(const xai_pTile3D inTile,
+                                                   xai_pArray bufArray,
+                                                   xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2_I32_WHD_DWH(const xai_pTile3D inTile,
+                                                   xai_pArray bufArray,
+                                                   xai_pTile3D outTile);
+
+/* Unsigned to Signed -- NOTE(review): inTile is not const-qualified here, unlike the neighbouring prototypes. */
+_XAI_API_ XAI_ERR_TYPE xaiUnsignedToSigned3D_U8S8(xai_pTile3D inTile,
+                                                  xai_pTile3D outTile);
+
+/* Data Conversions: the variants below take (inTile, outTile, const uint16_t scale, const uint8_t shift). */
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           const uint16_t scale,
+                                           const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S32S8(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S32U8(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S32S16(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const uint16_t scale,
+                                                  const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S32U16(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const uint16_t scale,
+                                                  const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S8S16(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S8I32(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S16I32(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const uint16_t scale,
+                                                  const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S16I64(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const uint16_t scale,
+                                                  const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U8I32(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U16I32(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const uint16_t scale,
+                                                  const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U16I64(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const uint16_t scale,
+                                                  const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S8I64(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U8S16(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U8I64(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U8U16(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S16I8(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U16I8(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S16(const xai_pTile3D inTile,
+                                               xai_pTile3D outTile,
+                                               const uint16_t scale,
+                                               const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U16S16(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const uint16_t scale,
+                                                  const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U8S8(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                const uint16_t scale,
+                                                const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S16U16(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const uint16_t scale,
+                                                  const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S8U8(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                const uint16_t scale,
+                                                const uint8_t shift);
+/* Float conversions: the variants below take a single 'const float scale' and no shift argument. */
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_FLOATIX(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const float scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_FLOATS8(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const float scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_FLOATS16(const xai_pTile3D inTile,
+                                                    xai_pTile3D outTile,
+                                                    const float scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_FLOATU16(const xai_pTile3D inTile,
+                                                    xai_pTile3D outTile,
+                                                    const float scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_FLOATU8(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const float scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_IXFLOAT(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const float scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S8FLOAT(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const float scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S16FLOAT(const xai_pTile3D inTile,
+                                                    xai_pTile3D outTile,
+                                                    const float scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U16FLOAT(const xai_pTile3D inTile,
+                                                    xai_pTile3D outTile,
+                                                    const float scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U8FLOAT(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const float scale);
+
+/* Data Conversions with Asymmetric Quantization: add an int16_t argument (zeroPoint/fixUp/zeroIn/zeroOut). */
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const int16_t zeroPoint,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8S8(const xai_pTile3D inTile,
+                                                      xai_pTile3D outTile,
+                                                      const int16_t fixUp,
+                                                      const uint16_t scale,
+                                                      const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8U8(const xai_pTile3D inTile,
+                                                      xai_pTile3D outTile,
+                                                      const int16_t fixUp,
+                                                      const uint16_t scale,
+                                                      const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8S16(const xai_pTile3D inTile,
+                                                       xai_pTile3D outTile,
+                                                       const int16_t fixUp,
+                                                       const uint16_t scale,
+                                                       const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8U16(const xai_pTile3D inTile,
+                                                       xai_pTile3D outTile,
+                                                       const int16_t fixUp,
+                                                       const uint16_t scale,
+                                                       const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8I32(const xai_pTile3D inTile,
+                                                       xai_pTile3D outTile,
+                                                       const int16_t zeroIn,
+                                                       const uint16_t scale,
+                                                       const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8I64(const xai_pTile3D inTile,
+                                                       xai_pTile3D outTile,
+                                                       const int16_t zeroIn,
+                                                       const uint16_t scale,
+                                                       const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_U8S8(const xai_pTile3D inTile,
+                                                      xai_pTile3D outTile,
+                                                      const int16_t zeroOut,
+                                                      const uint16_t scale,
+                                                      const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S16S8(const xai_pTile3D inTile,
+                                                       xai_pTile3D outTile,
+                                                       const int16_t zeroOut,
+                                                       const uint16_t scale,
+                                                       const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_U16S8(const xai_pTile3D inTile,
+                                                       xai_pTile3D outTile,
+                                                       const int16_t zeroOut,
+                                                       const uint16_t scale,
+                                                       const uint8_t shift);
+
+/* Temporary prototype declaration, to be removed later (same signature as xaiDataConversion3D_AsymQ_U16S8). */
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U16AS8(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const int16_t zeroOut,
+                                                  const uint16_t scale,
+                                                  const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S32S8(const xai_pTile3D inTile,
+                                                       xai_pTile3D outTile,
+                                                       const int16_t zeroOut,
+                                                       const uint16_t scale,
+                                                       const uint8_t shift);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8FLOAT(const xai_pTile3D inTile,
+                                                         xai_pTile3D outTile,
+                                                         const float scale,
+                                                         const int16_t zeroPoint);
+
+_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_FLOATS8(const xai_pTile3D inTile,
+                                                         xai_pTile3D outTile,
+                                                         const float scale,
+                                                         const int16_t zeroPoint);
+/* ReOrg: 3D-tile prototypes taking (inTile, outTile, const xai_cnn_reorg_params *params). */
+_XAI_API_ XAI_ERR_TYPE xaiReOrg3D(const xai_pTile3D inTile,
+                                  xai_pTile3D outTile,
+                                  const xai_cnn_reorg_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReOrg3D_I8_WHD(const xai_pTile3D inTile,
+                                         xai_pTile3D outTile,
+                                         const xai_cnn_reorg_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReOrg3D_I16_WHD(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reorg_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReOrg3D_I8_DWH(const xai_pTile3D inTile,
+                                         xai_pTile3D outTile,
+                                         const xai_cnn_reorg_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReOrg3D_I16_DWH(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reorg_params *params);
+
+/* ReOrg4D: xaiReOrg4DBatchSpace* prototypes operate on xai_pTile4D tiles with xai_cnn_reorg4D_params. */
+_XAI_API_ XAI_ERR_TYPE xaiReOrg4DBatchSpace(const xai_pTile4D inTile,
+                                            xai_pTile4D outTile,
+                                            const xai_cnn_reorg4D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReOrg4DBatchSpace_I8_WHDN(const xai_pTile4D inTile,
+                                                    xai_pTile4D outTile,
+                                                    const xai_cnn_reorg4D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReOrg4DBatchSpace_I16_WHDN(const xai_pTile4D inTile,
+                                                     xai_pTile4D outTile,
+                                                     const xai_cnn_reorg4D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReOrg4DBatchSpace_I8_DWHN(const xai_pTile4D inTile,
+                                                    xai_pTile4D outTile,
+                                                    const xai_cnn_reorg4D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReOrg4DBatchSpace_I16_DWHN(const xai_pTile4D inTile,
+                                                     xai_pTile4D outTile,
+                                                     const xai_cnn_reorg4D_params *params);
+
+/* ReOrg Caffe (the prototypes below are commented out) */
+/*_XAI_API_ XAI_ERR_TYPE xaiReOrgCaffe3D_I8_DWH(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_reorg_params *params);
+   _XAI_API_ XAI_ERR_TYPE xaiReOrgCaffe3D_I16_DWH(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_reorg_params *params);
+ */
+/* Renormalisation: xaiRenorm3D* take (renormScale, renormShift); VQ variants a scaleArray; 3D2 variants a params struct. */
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D(const xai_pTile3D inTile,
+                                   xai_pTile3D outTile,
+                                   const uint16_t renormScale,
+                                   const uint8_t renormShift);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D_S8(const xai_pTile3D inTile,
+                                      xai_pTile3D outTile,
+                                      const uint16_t renormScale,
+                                      const uint8_t renormShift);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D_U8(const xai_pTile3D inTile,
+                                      xai_pTile3D outTile,
+                                      const uint16_t renormScale,
+                                      const uint8_t renormShift);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D_S16(const xai_pTile3D inTile,
+                                       xai_pTile3D outTile,
+                                       const uint16_t renormScale,
+                                       const uint8_t renormShift);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenormVQ3D_S16_WHD(const xai_pTile3D inTile,
+                                             const xai_pArray scaleArray,
+                                             xai_pTile3D outTile,
+                                             const uint8_t renormShift);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiRenormVQ3D_S16_DWH(const xai_pTile3D inTile,
+                                             const xai_pArray scaleArray,
+                                             xai_pTile3D outTile,
+                                             const uint8_t renormShift);
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D_U16(const xai_pTile3D inTile,
+                                       xai_pTile3D outTile,
+                                       const uint16_t renormScale,
+                                       const uint8_t renormShift);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenormVQ3D_U16_WHD(const xai_pTile3D inTile,
+                                             const xai_pArray scaleArray,
+                                             xai_pTile3D outTile,
+                                             const uint8_t renormShift);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiRenormVQ3D_U16_DWH(const xai_pTile3D inTile,
+                                             const xai_pArray scaleArray,
+                                             xai_pTile3D outTile,
+                                             const uint8_t renormShift);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2(const xai_pTile3D inTile,
+                                    xai_pTile3D outTile,
+                                    const xai_cnn_renorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_S8(const xai_pTile3D inTile,
+                                       xai_pTile3D outTile,
+                                       const xai_cnn_renorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_U8(const xai_pTile3D inTile,
+                                       xai_pTile3D outTile,
+                                       const xai_cnn_renorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_S16(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const xai_cnn_renorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_U16(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const xai_cnn_renorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_AsymQ_S8(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_renorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_AsymQ_U8S8(const xai_pTile3D inTile,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_renorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_AsymQ_S8U8(const xai_pTile3D inTile,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_renorm_params *params);
+/* Interp Variants */
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D(const xai_pTile3D inTile,
+                                   xai_pTile3D outTile,
+                                   const xai_cnn_interp3D_params *params);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_U8_WHD(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_interp3D_params *params);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_S8_WHD(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_interp3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_S16_WHD(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_interp3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_U8S16_WHD(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_interp3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_S8S16_WHD(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_interp3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_U8_DWH(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_interp3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_S8_DWH(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_interp3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_S16_DWH(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_interp3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_U8S16_DWH(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_interp3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_S8S16_DWH(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_interp3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_SetTileParams(const xai_size3D *inFrame3DSize,
+                                                 const xai_size3D *outFrame3DSize,
+                                                 const xai_cnn_data_order dataOrder,
+                                                 int32_t half_pixel_flag,
+                                                 xai_cnn_interp3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_SetTileParams(const xai_size3D *inFrame3DSize,
+                                                        const xai_size3D *outFrame3DSize,
+                                                        const xai_cnn_data_order dataOrder,
+                                                        xai_cnn_resize_nearest3D_params *params);
+
+/* ResizeNearest variants */
+
+_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_resize_nearest3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_S8_WHD(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_resize_nearest3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_S16_WHD(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_resize_nearest3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_U8_WHD(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_resize_nearest3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_S8U8_WHD(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_resize_nearest3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_S8_DWH(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_resize_nearest3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_S16_DWH(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_resize_nearest3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_U8_DWH(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_resize_nearest3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_S8U8_DWH(const xai_pTile3D inTile,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_resize_nearest3D_params *params);
+
+/* Leaky RELU and RELU (with minVal/maxVal bounds) */
+_XAI_API_ XAI_ERR_TYPE xaiLeakyRELU(const xai_pTile3D inTile,
+                                    xai_pTile3D outTile,
+                                    const XAI_Q15 slope);
+
+_XAI_API_ XAI_ERR_TYPE xaiLeakyRELU_S8(const xai_pTile3D inTile,
+                                       xai_pTile3D outTile,
+                                       const XAI_Q15 slope);
+
+_XAI_API_ XAI_ERR_TYPE xaiLeakyRELU_S16(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const XAI_Q15 slope);
+
+_XAI_API_ XAI_ERR_TYPE xaiLeakyRELU_S16S8(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const XAI_Q15 slope);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiRELU(const xai_pTile3D inTile,
+                               xai_pTile3D outTile,
+                               const uint8_t minVal,
+                               const uint8_t maxVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiRELU_U8(const xai_pTile3D inTile,
+                                  xai_pTile3D outTile,
+                                  const uint8_t minVal,
+                                  const uint8_t maxVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiRELU_S8U8(const xai_pTile3D inTile,
+                                    xai_pTile3D outTile,
+                                    const uint8_t minVal,
+                                    const uint8_t maxVal);
+
+/* PRELU, RELUScale and RELU16 */
+_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_S8_WHD(const xai_pTile3D inTile,
+                                         const xai_pTile3D slopeArray,
+                                         xai_pTile3D outTile,
+                                         const uint8_t outputShift);
+
+_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_S16_WHD(const xai_pTile3D inTile,
+                                          const xai_pTile3D slopeArray,
+                                          xai_pTile3D outTile,
+                                          const uint8_t outputShift);
+
+_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_S8_DWH(const xai_pTile3D inTile,
+                                         const xai_pTile3D slopeArray,
+                                         xai_pTile3D outTile,
+                                         const uint8_t outputShift);
+
+_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_S16_DWH(const xai_pTile3D inTile,
+                                          const xai_pTile3D slopeArray,
+                                          xai_pTile3D outTile,
+                                          const uint8_t outputShift);
+
+_XAI_API_ XAI_ERR_TYPE xaiPRELU3D(const xai_pTile3D inTile,
+                                  const xai_pTile3D slopeArray,
+                                  xai_pTile3D outTile,
+                                  const uint8_t outputShift);
+
+_XAI_API_ XAI_ERR_TYPE xaiRELUScale(const xai_pTile3D inTile,
+                                    xai_pTile3D outTile,
+                                    const int16_t scale,
+                                    const uint8_t shift,
+                                    const int8_t offset,
+                                    const uint8_t minVal,
+                                    const uint8_t maxVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiRELUScale_S8U8(const xai_pTile3D inTile,
+                                         xai_pTile3D outTile,
+                                         const int16_t scale,
+                                         const uint8_t shift,
+                                         const int8_t offset,
+                                         const uint8_t minVal,
+                                         const uint8_t maxVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiRELU16(const xai_pTile3D inTile,
+                                 xai_pTile3D outTile,
+                                 const int32_t minVal,
+                                 const int32_t maxVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiRELU16_S16I16(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const int32_t minVal,
+                                        const int32_t maxVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiRELU16_U16I16(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const int32_t minVal,
+                                        const int32_t maxVal);
+
+/* Modified RELU for BN + depthwise clip operation (takes per-channel thresholdMax/thresholdMin arrays) */
+_XAI_API_ XAI_ERR_TYPE xaiChannelwiseClip(const xai_pTile3D inTile,
+                                          const xai_pArray thresholdMax,
+                                          const xai_pArray thresholdMin,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_relu_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiChannelwiseClip_S8_DWH(const xai_pTile3D inTile,
+                                                 const xai_pArray thresholdMax,
+                                                 const xai_pArray thresholdMin,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_relu_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiChannelwiseClip_S8_WHD(const xai_pTile3D inTile,
+                                                 const xai_pArray thresholdMax,
+                                                 const xai_pArray thresholdMin,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_relu_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiChannelwiseClip_S16_DWH(const xai_pTile3D inTile,
+                                                  const xai_pArray thresholdMax,
+                                                  const xai_pArray thresholdMin,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_relu_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiChannelwiseClip_S16_WHD(const xai_pTile3D inTile,
+                                                  const xai_pArray thresholdMax,
+                                                  const xai_pArray thresholdMin,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_relu_params *params);
+
+/* Batchnorm */
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8_WHD(const xai_pTile3D inTile,
+                                             const xai_pArray alphaArray,
+                                             const xai_pArray betaArray,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8_WHD(const xai_pTile3D inTile,
+                                             const xai_pArray alphaArray,
+                                             const xai_pArray betaArray,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8S16_WHD(const xai_pTile3D inTile,
+                                                const xai_pArray alphaArray,
+                                                const xai_pArray betaArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8S8_WHD(const xai_pTile3D inTile,
+                                               const xai_pArray alphaArray,
+                                               const xai_pArray betaArray,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8S16_WHD(const xai_pTile3D inTile,
+                                                const xai_pArray alphaArray,
+                                                const xai_pArray betaArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S16_WHD(const xai_pTile3D inTile,
+                                              const xai_pArray alphaArray,
+                                              const xai_pArray betaArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8_DWH(const xai_pTile3D inTile,
+                                             const xai_pArray alphaArray,
+                                             const xai_pArray betaArray,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8_DWH(const xai_pTile3D inTile,
+                                             const xai_pArray alphaArray,
+                                             const xai_pArray betaArray,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8S8_DWH(const xai_pTile3D inTile,
+                                               const xai_pArray alphaArray,
+                                               const xai_pArray betaArray,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8S16_DWH(const xai_pTile3D inTile,
+                                                const xai_pArray alphaArray,
+                                                const xai_pArray betaArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8S16_DWH(const xai_pTile3D inTile,
+                                                const xai_pArray alphaArray,
+                                                const xai_pArray betaArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S16_DWH(const xai_pTile3D inTile,
+                                              const xai_pArray alphaArray,
+                                              const xai_pArray betaArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D(const xai_pTile3D inTile,
+                                      const xai_pArray alphaArray,
+                                      const xai_pArray betaArray,
+                                      xai_pTile3D outTile,
+                                      const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8_Dim2(const xai_pTile3D inTile,
+                                              const xai_pArray alphaArray,
+                                              const xai_pArray betaArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S16_Dim2(const xai_pTile3D inTile,
+                                               const xai_pArray alphaArray,
+                                               const xai_pArray betaArray,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8U8_Dim2(const xai_pTile3D inTile,
+                                                const xai_pArray alphaArray,
+                                                const xai_pArray betaArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8S16_Dim2(const xai_pTile3D inTile,
+                                                 const xai_pArray alphaArray,
+                                                 const xai_pArray betaArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8_Dim2(const xai_pTile3D inTile,
+                                              const xai_pArray alphaArray,
+                                              const xai_pArray betaArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8S8_Dim2(const xai_pTile3D inTile,
+                                                const xai_pArray alphaArray,
+                                                const xai_pArray betaArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8S16_Dim2(const xai_pTile3D inTile,
+                                                 const xai_pArray alphaArray,
+                                                 const xai_pArray betaArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_Dim2(const xai_pTile3D inTile,
+                                           const xai_pArray alphaArray,
+                                           const xai_pArray betaArray,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_batchnorm_params *params);
+
+/* ArgMax / ArgMin */
+_XAI_API_ XAI_ERR_TYPE xaiArgmax_S8(const xai_pTile3D inTile,
+                                    xai_pTile3D outTileIdx,
+                                    xai_pTile3D outTileVal,
+                                    xai_pTile2D extraValCnt,
+                                    xai_pArray sortedIdxArr,
+                                    xai_pArray sortedValArr,
+                                    const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin_S8(const xai_pTile3D inTile,
+                                    xai_pTile3D outTileIdx,
+                                    xai_pTile3D outTileVal,
+                                    xai_pTile2D extraValCnt,
+                                    xai_pArray sortedIdxArr,
+                                    xai_pArray sortedValArr,
+                                    const uint16_t numSmallestVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_dim1(const xai_pTile3D inTile,
+                                        xai_pArray bufArray,
+                                        xai_pTile3D outTileIdx,
+                                        xai_pTile3D outTileVal,
+                                        const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_dim1(const xai_pTile3D inTile,
+                                        xai_pArray bufArray,
+                                        xai_pTile3D outTileIdx,
+                                        xai_pTile3D outTileVal,
+                                        const uint16_t numSmallestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_S8_dim1(const xai_pTile3D inTile,
+                                           xai_pArray bufArray,
+                                           xai_pTile3D outTileIdx,
+                                           xai_pTile3D outTileVal,
+                                           const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_S8_dim1(const xai_pTile3D inTile,
+                                           xai_pArray bufArray,
+                                           xai_pTile3D outTileIdx,
+                                           xai_pTile3D outTileVal,
+                                           const uint16_t numSmallestVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_U8_dim1(const xai_pTile3D inTile,
+                                           xai_pArray bufArray,
+                                           xai_pTile3D outTileIdx,
+                                           xai_pTile3D outTileVal,
+                                           const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_U8_dim1(const xai_pTile3D inTile,
+                                           xai_pArray bufArray,
+                                           xai_pTile3D outTileIdx,
+                                           xai_pTile3D outTileVal,
+                                           const uint16_t numSmallestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_S16_dim1(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_S16_dim1(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numSmallestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_U16_dim1(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_U16_dim1(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numSmallestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_dim2(const xai_pTile3D inTile,
+                                        xai_pArray bufArray,
+                                        xai_pTile3D outTileIdx,
+                                        xai_pTile3D outTileVal,
+                                        const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_dim2(const xai_pTile3D inTile,
+                                        xai_pArray bufArray,
+                                        xai_pTile3D outTileIdx,
+                                        xai_pTile3D outTileVal,
+                                        const uint16_t numSmallestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_S8_dim2(const xai_pTile3D inTile,
+                                           xai_pArray bufArray,
+                                           xai_pTile3D outTileIdx,
+                                           xai_pTile3D outTileVal,
+                                           const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_S8_dim2(const xai_pTile3D inTile,
+                                           xai_pArray bufArray,
+                                           xai_pTile3D outTileIdx,
+                                           xai_pTile3D outTileVal,
+                                           const uint16_t numSmallestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_U8_dim2(const xai_pTile3D inTile,
+                                           xai_pArray bufArray,
+                                           xai_pTile3D outTileIdx,
+                                           xai_pTile3D outTileVal,
+                                           const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_U8_dim2(const xai_pTile3D inTile,
+                                           xai_pArray bufArray,
+                                           xai_pTile3D outTileIdx,
+                                           xai_pTile3D outTileVal,
+                                           const uint16_t numSmallestVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_S16_dim2(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_S16_dim2(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numSmallestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_U16_dim2(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_U16_dim2(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numSmallestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_dim3(const xai_pTile3D inTile,
+                                        xai_pArray bufArray,
+                                        xai_pTile3D outTileIdx,
+                                        xai_pTile3D outTileVal,
+                                        const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_dim3(const xai_pTile3D inTile,
+                                        xai_pArray bufArray,
+                                        xai_pTile3D outTileIdx,
+                                        xai_pTile3D outTileVal,
+                                        const uint16_t numSmallestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_S8_dim3(const xai_pTile3D inTile,
+                                           xai_pArray bufArray,
+                                           xai_pTile3D outTileIdx,
+                                           xai_pTile3D outTileVal,
+                                           const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_S8_dim3(const xai_pTile3D inTile,
+                                           xai_pArray bufArray,
+                                           xai_pTile3D outTileIdx,
+                                           xai_pTile3D outTileVal,
+                                           const uint16_t numSmallestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_U8_dim3(const xai_pTile3D inTile,
+                                           xai_pArray bufArray,
+                                           xai_pTile3D outTileIdx,
+                                           xai_pTile3D outTileVal,
+                                           const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_U8_dim3(const xai_pTile3D inTile,
+                                           xai_pArray bufArray,
+                                           xai_pTile3D outTileIdx,
+                                           xai_pTile3D outTileVal,
+                                           const uint16_t numSmallestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_S16_dim3(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_S16_dim3(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numSmallestVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_U16_dim3(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_U16_dim3(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numSmallestVal);
+/* Argmax top-K merge variants */
+_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmax3D_S8_dim1(const xai_pTile3D inTileIdx,
+                                                    const xai_pTile3D inTileVal,
+                                                    const xai_pArray inPtrOffsetArr,
+                                                    xai_pArray bufArray,
+                                                    xai_pTile3D outTileIdx,
+                                                    xai_pTile3D outTileVal,
+                                                    const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmax3D_U8_dim1(const xai_pTile3D inTileIdx,
+                                                    const xai_pTile3D inTileVal,
+                                                    const xai_pArray inPtrOffsetArr,
+                                                    xai_pArray bufArray,
+                                                    xai_pTile3D outTileIdx,
+                                                    xai_pTile3D outTileVal,
+                                                    const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmax3D_S16_dim1(const xai_pTile3D inTileIdx,
+                                                     const xai_pTile3D inTileVal,
+                                                     const xai_pArray inPtrOffsetArr,
+                                                     xai_pArray bufArray,
+                                                     xai_pTile3D outTileIdx,
+                                                     xai_pTile3D outTileVal,
+                                                     const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmax3D_U16_dim1(const xai_pTile3D inTileIdx,
+                                                     const xai_pTile3D inTileVal,
+                                                     const xai_pArray inPtrOffsetArr,
+                                                     xai_pArray bufArray,
+                                                     xai_pTile3D outTileIdx,
+                                                     xai_pTile3D outTileVal,
+                                                     const uint16_t numLargestVal);
+_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmax3D_dim1(const xai_pTile3D inTileIdx,
+                                                 const xai_pTile3D inTileVal,
+                                                 const xai_pArray inPtrOffsetArr,
+                                                 xai_pArray bufArray,
+                                                 xai_pTile3D outTileIdx,
+                                                 xai_pTile3D outTileVal,
+                                                 const uint16_t numLargestVal);
+/* SoftMax Variants */
+
+/* 1D variant */
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax_S16U16(const xai_pArray input,
+                                         const xai_pArray lutArray,
+                                         xai_pArray output,
+                                         const xai_cnn_softmax_params *params);
+
+/* 3D variant */
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcMaxval3D_S8(const xai_pTile3D inTile,
+                                          xai_pArray maxValArr,
+                                          xai_cnn_maxval_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcMaxval3D_S16(const xai_pTile3D inTile,
+                                           xai_pArray maxValArr,
+                                           xai_cnn_maxval_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Dim1(const xai_pTile3D input,
+                                         const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output,
+                                         const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8U8_Dim1(const xai_pTile3D inTile,
+                                              const xai_pArray lutArray,
+                                              xai_pArray buffArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8AS8_Dim1(const xai_pTile3D inTile,
+                                               const xai_pArray lutArray,
+                                               xai_pArray buffArray,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8AS8_Dim2(const xai_pTile3D input,
+                                               const xai_pArray lutArray,
+                                               xai_pArray bufArray,
+                                               xai_pTile3D output,
+                                               const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8AS8_Dim3(const xai_pTile3D inTile,
+                                               const xai_pArray lutArray,
+                                               xai_pArray bufArray,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_softmax_params  *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8AS8(const xai_pTile3D inTile,
+                                          const xai_pArray lutArray,
+                                          xai_pArray buffArray,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S16U16_Dim1(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S16U16_Dim2(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_softmax_params  *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8U8_Dim3(const xai_pTile3D inTile,
+                                              const xai_pArray lutArray,
+                                              xai_pArray bufArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_softmax_params  *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Dim3(const xai_pTile3D inTile,
+                                         const xai_pArray lutArray,
+                                         xai_pArray bufArray,
+                                         xai_pTile3D outTile,
+                                         const xai_cnn_softmax_params  *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S16U16_Dim3(const xai_pTile3D inTile,
+                                                const xai_pArray lutArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_softmax_params  *params);
+
+/* Faster performing implementation of Softmax3D along Dim1 */
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Dim1_fast(const xai_pTile3D inTile,
+                                              const xai_pArray lutArray,
+                                              xai_pArray buffArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_softmax_params *params);
+
+/* Faster performing implementation of Softmax3D along Dim2 */
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Dim2_fast(const xai_pTile3D inTile,
+                                              const xai_pArray lutArray,
+                                              xai_pArray buffArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_softmax_params *params);
+
+/* Faster performing implementation of Softmax3D along Dim3 */
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Dim3_fast(const xai_pTile3D inTile,
+                                              const xai_pArray lutArray,
+                                              xai_pArray buffArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8U8_Dim1_fast(const xai_pTile3D inTile,
+                                                   const xai_pArray lutArray,
+                                                   xai_pArray buffArray,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_softmax_params *params);
+
+/* Faster performing implementation of Softmax3D along Dim1 */
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S16U16_Dim1_fast(const xai_pTile3D inTile,
+                                                     const xai_pArray lutArray,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_softmax_params *params);
+
+/* Faster performing implementation of Softmax3D along Dim2 */
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S16U16_Dim2_fast(const xai_pTile3D inTile,
+                                                     const xai_pArray lutArray,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_softmax_params  *params);
+
+/* Faster performing implementation of Softmax3D along Dim3 */
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S16U16_Dim3_fast(const xai_pTile3D inTile,
+                                                     const xai_pArray lutArray,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_softmax_params  *params);
+
+/* Input 3D MxN */
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Mclasses(const xai_pTile3D input,
+                                             const xai_pArray lutArray,
+                                             xai_pArray buffArray,
+                                             xai_pTile3D output,
+                                             const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Mclasses_S16U16(const xai_pTile3D input,
+                                                    const xai_pArray lutArray,
+                                                    xai_pTile3D output,
+                                                    const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Mclasses_S8U8(const xai_pTile3D input,
+                                                  const xai_pArray lutArray,
+                                                  xai_pArray buffArray,
+                                                  xai_pTile3D output,
+                                                  const xai_cnn_softmax_params *params);
+
+/* Input 3D NxM */
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Ndata(const xai_pTile3D input,
+                                          const xai_pArray lutArray,
+                                          xai_pArray bufArray,
+                                          xai_pTile3D output,
+                                          const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Ndata_S8U8(const xai_pTile3D input,
+                                               const xai_pArray lutArray,
+                                               xai_pArray bufArray,
+                                               xai_pTile3D output,
+                                               const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S8U8_Dim2(const xai_pTile3D input,
+                                                   const xai_pArray lutArray,
+                                                   xai_pArray bufArray,
+                                                   xai_pTile3D output,
+                                                   const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8U8_Dim2(const xai_pTile3D input,
+                                              const xai_pArray lutArray,
+                                              xai_pArray bufArray,
+                                              xai_pTile3D output,
+                                              const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Dim2(const xai_pTile3D input,
+                                         const xai_pArray lutArray,
+                                         xai_pArray bufArray,
+                                         xai_pTile3D output,
+                                         const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8U8_Dim2_fast(const xai_pTile3D input,
+                                                   const xai_pArray lutArray,
+                                                   xai_pArray bufArray,
+                                                   xai_pTile3D output,
+                                                   const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8U8_Dim3_fast(const xai_pTile3D input,
+                                                   const xai_pArray lutArray,
+                                                   xai_pArray bufArray,
+                                                   xai_pTile3D output,
+                                                   const xai_cnn_softmax_params *params);
+
+/* Input 3D NxM */
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Dim1(const xai_pTile3D input,
+                                              const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output,
+                                              const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Dim2(const xai_pTile3D input,
+                                              const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output,
+                                              const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Dim3(const xai_pTile3D input,
+                                              const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output,
+                                              const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Ndata_S16U16(const xai_pTile3D input,
+                                                 const xai_pArray lutArray,
+                                                 xai_pTile3D output,
+                                                 const xai_cnn_softmax_params *params);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S8U8_Dim1(const xai_pTile3D inTile,
+                                                   const xai_pArray lutArray,
+                                                   xai_pArray bufArray,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S16U16_Dim1(const xai_pTile3D inTile,
+                                                     const xai_pArray lutArray,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S16U16_Dim2(const xai_pTile3D inTile,
+                                                     const xai_pArray lutArray,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_softmax_params  *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S8U8_Dim3(const xai_pTile3D inTile,
+                                                   const xai_pArray lutArray,
+                                                   xai_pArray buffArray,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S16U16_Dim3(const xai_pTile3D inTile,
+                                                     const xai_pArray lutArray,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_softmax_params  *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Dim1_fast(const xai_pTile3D input,
+                                                   const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output,
+                                                   const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Dim2_fast(const xai_pTile3D input,
+                                                   const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output,
+                                                   const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Dim3_fast(const xai_pTile3D input,
+                                                   const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output,
+                                                   const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S8U8_Dim1_fast(const xai_pTile3D inTile,
+                                                        const xai_pArray lutArray,
+                                                        xai_pArray buffArray,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S16U16_Dim1_fast(const xai_pTile3D inTile,
+                                                          const xai_pArray lutArray,
+                                                          xai_pTile3D outTile,
+                                                          const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S8U8_Dim2_fast(const xai_pTile3D inTile,
+                                                        const xai_pArray lutArray,
+                                                        xai_pArray buffArray,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S16U16_Dim2_fast(const xai_pTile3D inTile,
+                                                          const xai_pArray lutArray,
+                                                          xai_pTile3D outTile,
+                                                          const xai_cnn_softmax_params  *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S8U8_Dim3_fast(const xai_pTile3D inTile,
+                                                        const xai_pArray lutArray,
+                                                        xai_pArray buffArray,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S16U16_Dim3_fast(const xai_pTile3D inTile,
+                                                          const xai_pArray lutArray,
+                                                          xai_pTile3D outTile,
+                                                          const xai_cnn_softmax_params  *params);
+
+/* Input 3D MxN */
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Mclasses_S16U16(const xai_pTile3D input,
+                                                         const xai_pArray lutArray,
+                                                         xai_pTile3D output,
+                                                         const xai_cnn_softmax_params *params);
+
+/* Input 3D NxM */
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Ndata_S16U16(const xai_pTile3D input,
+                                                      const xai_pArray lutArray,
+                                                      xai_pTile3D output,
+                                                      const xai_cnn_softmax_params *params);
+
+/* Softmax 8-bit variant */
+/* 1D variant */
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax_S8U8(const xai_pArray input,
+                                       const xai_pArray lutArray,
+                                       xai_pArray buffArray,
+                                       xai_pArray output,
+                                       const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax(const xai_pArray input,
+                                  const xai_pArray lutArray,
+                                  xai_pArray buffArray,
+                                  xai_pArray output,
+                                  const xai_cnn_softmax_params *params);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S8U8_Dim1(const xai_pTile3D inTile,
+                                                    const xai_pTile3D maskTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pArray buffArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S8AS8_Dim1(const xai_pTile3D inTile,
+                                                     const xai_pTile3D maskTile,
+                                                     const xai_pArray lutArray,
+                                                     xai_pArray buffArray,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S16U16_Dim1(const xai_pTile3D inTile,
+                                                      const xai_pTile3D maskTile,
+                                                      const xai_pArray lutArray,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S8U8_Dim2(const xai_pTile3D inTile,
+                                                    const xai_pTile3D maskTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pArray buffArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S8AS8_Dim2(const xai_pTile3D inTile,
+                                                     const xai_pTile3D maskTile,
+                                                     const xai_pArray lutArray,
+                                                     xai_pArray buffArray,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S16U16_Dim2(const xai_pTile3D inTile,
+                                                      const xai_pTile3D maskTile,
+                                                      const xai_pArray lutArray,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S8U8_Dim3(const xai_pTile3D inTile,
+                                                    const xai_pTile3D maskTile,
+                                                    const xai_pArray lutArray,
+                                                    xai_pArray buffArray,
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S8AS8_Dim3(const xai_pTile3D inTile,
+                                                     const xai_pTile3D maskTile,
+                                                     const xai_pArray lutArray,
+                                                     xai_pArray buffArray,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S16U16_Dim3(const xai_pTile3D inTile,
+                                                      const xai_pTile3D maskTile,
+                                                      const xai_pArray lutArray,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_softmax_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D(const xai_pTile3D inTile,
+                                          const xai_pTile3D maskTile,
+                                          const xai_pArray lutArray,
+                                          xai_pArray buffArray,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_softmax_params *params);
+
+
+/*Sigmoid3D functions*/
+
+_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D(const xai_pTile3D inTile,
+                                    const xai_pArray lutArray,
+                                    xai_pTile3D outTile,
+                                    const int16_t shift,
+                                    const int16_t scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S8U8(const xai_pTile3D inTile,
+                                         const xai_pArray lutArray,
+                                         xai_pTile3D outTile,
+                                         const int16_t shift,
+                                         const int16_t scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S8AS8(const xai_pTile3D inTile,
+                                          const xai_pArray lutArray,
+                                          xai_pTile3D outTile,
+                                          const int16_t shift,
+                                          const int16_t scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S8(const xai_pTile3D inTile,
+                                       const xai_pArray lutArray,
+                                       xai_pTile3D outTile,
+                                       const int16_t shift,
+                                       const int16_t scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S16U8(const xai_pTile3D inTile,
+                                          const xai_pArray lutArray,
+                                          xai_pTile3D outTile,
+                                          const int16_t shift,
+                                          const int16_t scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S16S8(const xai_pTile3D inTile,
+                                          const xai_pArray lutArray,
+                                          xai_pTile3D outTile,
+                                          const int16_t shift,
+                                          const int16_t scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S16U16(const xai_pTile3D inTile,
+                                           const xai_pArray lutArray,
+                                           xai_pTile3D outTile,
+                                           const int16_t shift,
+                                           const int16_t scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S16(const xai_pTile3D inTile,
+                                        const xai_pArray lutArray,
+                                        xai_pTile3D outTile,
+                                        const int16_t shift,
+                                        const int16_t scale);
+
+/*Tanh3D hyperbolic functions*/
+
+
+_XAI_API_ XAI_ERR_TYPE xaiTanh3D(const xai_pTile3D inTile,
+                                 const xai_pArray lutArray,
+                                 xai_pTile3D outTile,
+                                 const int16_t shift,
+                                 const int16_t scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiTanh3D_S8(const xai_pTile3D inTile,
+                                    const xai_pArray lutArray,
+                                    xai_pTile3D outTile,
+                                    const int16_t shift,
+                                    const int16_t scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiTanh3D_S16S8(const xai_pTile3D inTile,
+                                       const xai_pArray lutArray,
+                                       xai_pTile3D outTile,
+                                       const int16_t shift,
+                                       const int16_t scale);
+
+_XAI_API_ XAI_ERR_TYPE xaiTanh3D_S16(const xai_pTile3D inTile,
+                                     const xai_pArray lutArray,
+                                     xai_pTile3D outTile,
+                                     const int16_t shift,
+                                     const int16_t scale);
+
+
+/* Eltwise Add */
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D(const xai_pTile3D inTile1,
+                                       const xai_pTile3D inTile2,
+                                       xai_pTile3D outTile,
+                                       const xai_cnn_eltwise_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1_S8I8(const xai_pTile3D inTile1,
+                                               const xai_pTile3D inTile2,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_eltwise_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1_U8(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_eltwise_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1_S8U8I8(const xai_pTile3D inTile1,
+                                                 const xai_pTile3D inTile2,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_eltwise_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1_S16I16(const xai_pTile3D inTile1,
+                                                 const xai_pTile3D inTile2,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_eltwise_params *param);
+
+/* Eltwise Add j2 variants */
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j2_S8I8_DWH(const xai_pTile3D inTile1,
+                                                   const xai_pTile3D inTile2,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_eltwise_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j2_U8_DWH(const xai_pTile3D inTile1,
+                                                 const xai_pTile3D inTile2,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_eltwise_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j2_S16I16_DWH(const xai_pTile3D inTile1,
+                                                     const xai_pTile3D inTile2,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_eltwise_params *param);
+
+/* Eltwise Add j1j2 variants */
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1j2_S8I8_DWH(const xai_pTile3D inTile1,
+                                                     const xai_pTile3D inTile2,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_eltwise_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1j2_U8_DWH(const xai_pTile3D inTile1,
+                                                   const xai_pTile3D inTile2,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_eltwise_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1j2_S16I16_DWH(const xai_pTile3D inTile1,
+                                                       const xai_pTile3D inTile2,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_eltwise_params *param);
+
+/* Eltwise Subtraction */
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D(const xai_pTile3D inTile1,
+                                       const xai_pTile3D inTile2,
+                                       xai_pTile3D outTile,
+                                       const xai_cnn_eltwise_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_j1_S8I8(const xai_pTile3D inTile1,
+                                               const xai_pTile3D inTile2,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_eltwise_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_j1_S16I16(const xai_pTile3D inTile1,
+                                                 const xai_pTile3D inTile2,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_eltwise_params *param);
+
+/* Eltwise Multiply */
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D(const xai_pTile3D inTile1,
+                                       const xai_pTile3D inTile2,
+                                       xai_pTile3D outTile,
+                                       const xai_cnn_eltwiseMul_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S8(const xai_pTile3D inTile1,
+                                          const xai_pTile3D inTile2,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_eltwiseMul_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S8S16(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_eltwiseMul_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S8U8S8(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_eltwiseMul_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S8U8U8(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_eltwiseMul_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S8U8S16(const xai_pTile3D inTile1,
+                                               const xai_pTile3D inTile2,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_eltwiseMul_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_U8I8(const xai_pTile3D inTile1,
+                                            const xai_pTile3D inTile2,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_eltwiseMul_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_U8S16(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_eltwiseMul_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S16(const xai_pTile3D inTile1,
+                                           const xai_pTile3D inTile2,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_eltwiseMul_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_U16(const xai_pTile3D inTile1,
+                                           const xai_pTile3D inTile2,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_eltwiseMul_params *param);
+
+/* Eltwise Exponent */
+
+_XAI_API_ XAI_ERR_TYPE xaiExp3D(const xai_pTile3D inTile,
+                                const xai_pArray lutArray,
+                                xai_pTile3D outTile,
+                                const xai_cnn_exponent_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiExp3D_S16(const xai_pTile3D inTile,
+                                    const xai_pArray lutArray,
+                                    xai_pTile3D outTile,
+                                    const xai_cnn_exponent_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiExp3D_S16U16(const xai_pTile3D inTile,
+                                       const xai_pArray lutArray,
+                                       xai_pTile3D outTile,
+                                       const xai_cnn_exponent_params *params);
+
+
+/* Maxout */
+_XAI_API_ XAI_ERR_TYPE xaiMaxout3D(const xai_pTile3D inTile,
+                                   xai_pTile3D outTile,
+                                   const uint16_t kSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxout3D_S8_WHD(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const uint16_t kSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxout3D_S8_DWH(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const uint16_t kSize);
+
+/* Mean subtraction */
+_XAI_API_ XAI_ERR_TYPE xaiMeanSubtraction3D_U8S8(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const uint8_t mean,
+                                                 const uint16_t scale,
+                                                 const uint8_t shift);
+/* Generate LUT for LRN */
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_generateLut(xai_pArray lutArray,
+                                                   xai_cnn_lrn_spatial_params *params,
+                                                   float alpha,
+                                                   float beta,
+                                                   float kValue,
+                                                   int32_t maxSumOfSquares,
+                                                   float qIn,
+                                                   float qOut);
+
+_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_generateLut(xai_pArray lutArray,
+                                                 xai_cnn_lrn_depth_params *params,
+                                                 float alpha,
+                                                 float beta,
+                                                 float kValue,
+                                                 int32_t maxSumOfSquares,
+                                                 float qIn,
+                                                 float qOut);
+
+/* Generate LUT for Tanh, Sigmoid, Softmax, Exp and StdDevRecip */
+_XAI_API_ XAI_ERR_TYPE xaiTanh_generateLut(xai_pArray lutArray,
+                                           const int32_t inpDataType,
+                                           const uint8_t lutQfactor,
+                                           const float qIn);
+
+_XAI_API_ XAI_ERR_TYPE xaiTanh3D_generateLut(const xai_pTile3D inTile,
+                                             xai_pArray lutArray,
+                                             const uint16_t tanh_cutoff);
+
+_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_generateLut(const xai_pTile3D inTile,
+                                                xai_pArray lutArray,
+                                                const uint16_t sigmoidCutoff);
+
+_XAI_API_ XAI_ERR_TYPE xaiSigmoid_generateLut(xai_pArray lutArray,
+                                              const int32_t inpDataType,
+                                              const uint8_t lutQfactor,
+                                              const float qIn);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax_generateLut_S16(xai_pArray lutArray,
+                                                  xai_cnn_softmax_params *params,
+                                                  const uint16_t qFactorLUT,
+                                                  const float qIn);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax_generateLut_S8(xai_pArray lutArray,
+                                                 xai_cnn_softmax_params *params,
+                                                 const uint16_t qFactorLUT,
+                                                 const float qIn
+                                                 );
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax_generateLut(xai_pArray lutArray,
+                                              const xai_pTile3D input, xai_cnn_softmax_params *params,
+                                              const uint16_t qFactorLUT,
+                                              const float qIn
+                                              );
+_XAI_API_ XAI_ERR_TYPE xaiExp_generateLUT(float inScaleF, int inQPDepth, float outScaleF,
+                                          int outQPDepth, xai_pArray tables,
+                                          xai_cnn_exponent_params *params, const xai_pArray qXBits,
+                                          const xai_pArray qYBits);
+
+_XAI_API_ XAI_ERR_TYPE xaiStdDevRecip_generateLut(xai_pArray rSqrtTable,
+                                                  const xai_dataType dataType);
+
+/* Wrapper Functions */
+_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_1(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           void *function2DPtr);
+
+_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_2(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           int32_t value,
+                                           void *function2DPtr);
+
+_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_3(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           xai_pArray pArray,
+                                           void *function2DPtr);
+
+_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_4(const xai_pTile3D inTile0,
+                                           const xai_pTile3D inTile1,
+                                           xai_pTile3D outTile,
+                                           void *function2DPtr);
+
+_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_5(const xai_pTile3D inTile0,
+                                           const xai_pTile3D inTile1,
+                                           xai_pTile3D outTile,
+                                           int32_t value,
+                                           void *function2DPtr);
+
+_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_6(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           xai_pTile2D tmpTile,
+                                           int32_t value,
+                                           void *function2DPtr);
+
+_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_7(const xai_pTile3D inTile,
+                                           int32_t *counter,
+                                           void *function2DPtr);
+
+_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_8(const xai_pTile3D inTile,
+                                           int32_t value,
+                                           int32_t *counter,
+                                           void *function2DPtr);
+
+_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_9(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           XAI_Q13_18 xscale,
+                                           XAI_Q13_18 yscale,
+                                           XAI_Q13_18 xshift,
+                                           XAI_Q13_18 yshift,
+                                           void *function2DPtr);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiDeConvGetDim4D_WHDN(const xai_pTile4D coeffTile,
+                                              xai_pTile4D subCoeffInfo[],
+                                              uint16_t *numSubKernels,
+                                              const uint8_t strideX,
+                                              const uint8_t strideY,
+                                              const uint8_t getNumKernelsFlag);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeConvGetDim3D_WHD(const xai_pTile3D coeffTile,
+                                             xai_pTile3D subCoeffInfo[],
+                                             uint16_t *numSubKernels,
+                                             const uint8_t strideX,
+                                             const uint8_t strideY,
+                                             const uint8_t getNumKernelsFlag);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeConvReOrder4D_I8_WHDN(const xai_pTile4D inTile,
+                                                  xai_pTile4D subCoeffs[],
+                                                  const xai_cnn_conv_params *param,
+                                                  const uint8_t transposeCoeffsFlag);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeConvReOrder3D_I8_WHD(const xai_pTile3D inTile,
+                                                 xai_pTile3D subCoeffs[],
+                                                 const xai_cnn_depthwiseDilatedConv_params *param,
+                                                 const uint8_t transposeCoeffsFlag);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeConvInterleave3D_I8_WHD(const xai_pTile3D inTile[],
+                                                    xai_pTile3D outTile,
+                                                    const xai_cnn_conv_params *convParams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDeConvInterleave3D_I8_WHD(const xai_pTile3D inTile[],
+                                                             xai_pTile3D outTile,
+                                                             const xai_cnn_depthwiseDilatedConv_params *convParams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeConvInterleave3D_I16_WHD(const xai_pTile3D inTile[],
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_conv_params *convParams);
+
+_XAI_API_ XAI_ERR_TYPE xaiBiasExtend_S32_MOD(const xai_pArray inBiasArray,
+                                             xai_pArray outBiasArray);
+
+_XAI_API_ XAI_ERR_TYPE xaiOutScaleExtend_U16_MOD(const xai_pArray outScaleArray,
+                                                 xai_pArray extendedOutScaleArray);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeConvGetDim4D_NDWH(const xai_pTile4D coeffTile,
+                                              xai_pTile4D subCoeffInfo[],
+                                              xai_pTile4D superCoeffInfo[],
+                                              uint16_t *numSubKernels,
+                                              uint16_t *numSuperKernels,
+                                              const uint8_t strideX,
+                                              const uint8_t strideY,
+                                              const uint8_t getNumKernelsFlag);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeConvGetDim3D_DWH(const xai_pTile3D coeffTile,
+                                             xai_pTile3D subCoeffInfo[],
+                                             uint16_t *numSubKernels,
+                                             const uint8_t strideX,
+                                             const uint8_t strideY,
+                                             const uint8_t getNumKernelsFlag);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeConvReOrder4D_I8_NDWH(const xai_pTile4D inTile,
+                                                  xai_pTile4D subCoeffs[],
+                                                  xai_pTile4D superCoeffs[],
+                                                  const xai_cnn_conv_params *param,
+                                                  const uint8_t transposeCoeffsFlag);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeConvReOrder3D_I8_DWH(const xai_pTile3D inTile,
+                                                 xai_pTile3D subCoeffs[],
+                                                 const xai_cnn_depthwiseDilatedConv_params *param,
+                                                 const uint8_t transposeCoeffsFlag);
+
+/*Permute Functions*/
+
+_XAI_API_ XAI_ERR_TYPE xaiPermute4D(const xai_pTile4D inTile,
+                                    xai_pTile4D outTile,
+                                    const xai_cnn_permute4D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiPermute4D_I8(const xai_pTile4D inTile,
+                                       xai_pTile4D outTile,
+                                       const xai_cnn_permute4D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiPermute4D_I16(const xai_pTile4D inTile,
+                                        xai_pTile4D outTile,
+                                        const xai_cnn_permute4D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiPermute4D_I32(const xai_pTile4D inTile,
+                                        xai_pTile4D outTile,
+                                        const xai_cnn_permute4D_params* params);
+
+_XAI_API_ XAI_ERR_TYPE xaiPermute4D2(const xai_pTile4D inTile,
+                                     xai_pArray bufArray,
+                                     xai_pTile4D outTile,
+                                     const xai_cnn_permute4D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiPermute4D2_I8(const xai_pTile4D inTile,
+                                        xai_pArray bufArray,
+                                        xai_pTile4D outTile,
+                                        const xai_cnn_permute4D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiPermute4D2_I16(const xai_pTile4D inTile,
+                                         xai_pArray bufArray,
+                                         xai_pTile4D outTile,
+                                         const xai_cnn_permute4D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiPermute4D2_I32(const xai_pTile4D inTile,
+                                         xai_pArray bufArray,
+                                         xai_pTile4D outTile,
+                                         const xai_cnn_permute4D_params *params);
+
+/*Shuffle variants*/
+
+_XAI_API_ XAI_ERR_TYPE xaiShuffle3D(const xai_pTile3D inTile,
+                                    xai_pTile3D outTile,
+                                    const xai_cnn_shuffle3D_params *shuffParams);
+
+_XAI_API_ XAI_ERR_TYPE xaiShuffle3D_I8_DWH(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_shuffle3D_params *shuffParams);
+
+_XAI_API_ XAI_ERR_TYPE xaiShuffle3D_I16_DWH(const xai_pTile3D inTile,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_shuffle3D_params *shuffParams);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiShuffle3D_I8_WHD(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_shuffle3D_params *shuffParams);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiShuffle3D_I16_WHD(const xai_pTile3D inTile,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_shuffle3D_params *shuffParams);
+
+
+/* Calc Normalize Wrapper Function */
+_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_I8(const xai_pTile3D pInTile,
+                                                   const xai_pArray rSqrtTable,
+                                                   const xai_pArray recipTable,
+                                                   xai_pArray buffArrSoS,
+                                                   xai_pArray buffNSAShiftArray,
+                                                   xai_pArray pNormScaleArr,
+                                                   const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_S16(const xai_pTile3D pInTile,
+                                                    const xai_pArray rSqrtTable,
+                                                    xai_pArray buffArrSoS,
+                                                    xai_pArray buffNSAShiftArray,
+                                                    xai_pArray pNormScaleArr,
+                                                    const xai_cnn_normalize3D_params *params);
+
+/* Calc Normalize Variants*/
+_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_S8_WHD(const xai_pTile3D pInTile,
+                                                       const xai_pArray rSqrtTable,
+                                                       const xai_pArray recipTable,
+                                                       xai_pArray buffArrSoS,
+                                                       xai_pArray buffNSAShiftArray,
+                                                       xai_pArray pNormScaleArr,
+                                                       const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_U8_WHD(const xai_pTile3D pInTile,
+                                                       const xai_pArray rSqrtTable,
+                                                       const xai_pArray recipTable,
+                                                       xai_pArray buffArrSoS,
+                                                       xai_pArray buffNSAShiftArray,
+                                                       xai_pArray pNormScaleArr,
+                                                       const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_S16_WHD(const xai_pTile3D pInTile,
+                                                        const xai_pArray rSqrtTable,
+                                                        xai_pArray buffArrSoS,
+                                                        xai_pArray buffNSAShiftArray,
+                                                        xai_pArray pNormScaleArr,
+                                                        const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_S8_DWH(const xai_pTile3D pInTile,
+                                                       const xai_pArray rSqrtTable,
+                                                       const xai_pArray recipTable,
+                                                       xai_pArray buffArrSoS,
+                                                       xai_pArray buffNSAShiftArray,
+                                                       xai_pArray pNormScaleArr,
+                                                       const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_U8_DWH(const xai_pTile3D pInTile,
+                                                       const xai_pArray rSqrtTable,
+                                                       const xai_pArray recipTable,
+                                                       xai_pArray buffArrSoS,
+                                                       xai_pArray buffNSAShiftArray,
+                                                       xai_pArray pNormScaleArr,
+                                                       const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_S16_DWH(const xai_pTile3D pInTile,
+                                                        const xai_pArray rSqrtTable,
+                                                        xai_pArray buffArrSoS,
+                                                        xai_pArray buffNSAShiftArray,
+                                                        xai_pArray pNormScaleArr,
+                                                        const xai_cnn_normalize3D_params *params);
+
+/* Apply Scale Wrapper Function */
+_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_I8(const xai_pTile3D InTile,
+                                          const xai_pArray pNormScaleArr,
+                                          const xai_pArray pQuantScaleTable,
+                                          const xai_pArray buffNSAShiftArray,
+                                          xai_pTile3D pOutTile,
+                                          const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_S16(const xai_pTile3D InTile,
+                                           const xai_pArray pNormScaleArr,
+                                           const xai_pArray pQuantScaleTable,
+                                           const xai_pArray buffNSAShiftArray,
+                                           xai_pTile3D pOutTile,
+                                           const xai_cnn_normalize3D_params *params);
+
+/* Apply Scale Variants */
+_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_S8_WHD(const xai_pTile3D inTile,
+                                              const xai_pArray pNormScaleArr,
+                                              const xai_pArray pQuantScaleTable,
+                                              const xai_pArray buffNSAShiftArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_U8_WHD(const xai_pTile3D inTile,
+                                              const xai_pArray pNormScaleArr,
+                                              const xai_pArray pQuantScaleTable,
+                                              const xai_pArray buffNSAShiftArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_S16_WHD(const xai_pTile3D inTile,
+                                               const xai_pArray pNormScaleArr,
+                                               const xai_pArray pQuantScaleTable,
+                                               const xai_pArray buffNSAShiftArray,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_S8_DWH(const xai_pTile3D inTile,
+                                              const xai_pArray pNormScaleArr,
+                                              const xai_pArray pQuantScaleTable,
+                                              const xai_pArray buffNSAShiftArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_U8_DWH(const xai_pTile3D inTile,
+                                              const xai_pArray pNormScaleArr,
+                                              const xai_pArray pQuantScaleTable,
+                                              const xai_pArray buffNSAShiftArray,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_S16_DWH(const xai_pTile3D inTile,
+                                               const xai_pArray pNormScaleArr,
+                                               const xai_pArray pQuantScaleTable,
+                                               const xai_pArray buffNSAShiftArray,
+                                               xai_pTile3D outTile,
+                                               const xai_cnn_normalize3D_params *params);
+
+/* Generate LUTs for the normalize variants */
+_XAI_API_ XAI_ERR_TYPE xaiNormalize3D_generateLut(xai_pArray rSqrtTable,
+                                                  xai_pArray recipTable,
+                                                  const xai_cnn_normalize3D_params *params,
+                                                  const xai_dataType dataType);
+
+_XAI_API_ XAI_ERR_TYPE xaiNormalize3D_generateLut_S16(xai_pArray rSqrtTable,
+                                                      const xai_cnn_normalize3D_params *params,
+                                                      const xai_dataType dataType);
+
+/* Instance Normalization API ref */
+
+/* calcInstanceNorm APIs */
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D(const xai_pTile3D inTile,
+                                                   xai_pArray meanArr,
+                                                   xai_pArray recipArr,
+                                                   xai_pArray buffArr,
+                                                   xai_pArray buffArrSoS,
+                                                   const xai_pArray rSqrtTable,
+                                                   const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S8_WHD(const xai_pTile3D inTile,
+                                                          xai_pArray meanArr,
+                                                          xai_pArray recipArr,
+                                                          xai_pArray buffArr,
+                                                          xai_pArray buffArrSoS,
+                                                          const xai_pArray rSqrtTable,
+                                                          const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_U8_WHD(const xai_pTile3D inTile,
+                                                          xai_pArray meanArr,
+                                                          xai_pArray recipArr,
+                                                          xai_pArray buffArr,
+                                                          xai_pArray buffArrSoS,
+                                                          const xai_pArray rSqrtTable,
+                                                          const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S16_WHD(const xai_pTile3D inTile,
+                                                           xai_pArray meanArr,
+                                                           xai_pArray recipArr,
+                                                           xai_pArray buffArr,
+                                                           xai_pArray buffArrSoS,
+                                                           const xai_pArray rSqrtTable,
+                                                           const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S8_DWH(const xai_pTile3D inTile,
+                                                          xai_pArray meanArr,
+                                                          xai_pArray recipArr,
+                                                          xai_pArray buffArr,
+                                                          xai_pArray buffArrSoS,
+                                                          const xai_pArray rSqrtTable,
+                                                          const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_U8_DWH(const xai_pTile3D inTile,
+                                                          xai_pArray meanArr,
+                                                          xai_pArray recipArr,
+                                                          xai_pArray buffArr,
+                                                          xai_pArray buffArrSoS,
+                                                          const xai_pArray rSqrtTable,
+                                                          const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S16_DWH(const xai_pTile3D inTile,
+                                                           xai_pArray meanArr,
+                                                           xai_pArray recipArr,
+                                                           xai_pArray buffArr,
+                                                           xai_pArray buffArrSoS,
+                                                           const xai_pArray rSqrtTable,
+                                                           const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S8_Dim1(const xai_pTile3D inTile,
+                                                           xai_pArray meanArr,
+                                                           xai_pArray recipArr,
+                                                           xai_pArray buffArr,
+                                                           xai_pArray buffArrSoS,
+                                                           const xai_pArray rSqrtTable,
+                                                           const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S8_Dim2(const xai_pTile3D inTile,
+                                                           xai_pArray meanArr,
+                                                           xai_pArray recipArr,
+                                                           xai_pArray buffArr,
+                                                           xai_pArray buffArrSoS,
+                                                           const xai_pArray rSqrtTable,
+                                                           const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S8_Dim3(const xai_pTile3D inTile,
+                                                           xai_pArray meanArr,
+                                                           xai_pArray recipArr,
+                                                           xai_pArray buffArr,
+                                                           xai_pArray buffArrSoS,
+                                                           const xai_pArray rSqrtTable,
+                                                           const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_U8_Dim1(const xai_pTile3D inTile,
+                                                           xai_pArray meanArr,
+                                                           xai_pArray recipArr,
+                                                           xai_pArray buffArr,
+                                                           xai_pArray buffArrSoS,
+                                                           const xai_pArray rSqrtTable,
+                                                           const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_U8_Dim2(const xai_pTile3D inTile,
+                                                           xai_pArray meanArr,
+                                                           xai_pArray recipArr,
+                                                           xai_pArray buffArr,
+                                                           xai_pArray buffArrSoS,
+                                                           const xai_pArray rSqrtTable,
+                                                           const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_U8_Dim3(const xai_pTile3D inTile,
+                                                           xai_pArray meanArr,
+                                                           xai_pArray recipArr,
+                                                           xai_pArray buffArr,
+                                                           xai_pArray buffArrSoS,
+                                                           const xai_pArray rSqrtTable,
+                                                           const xai_cnn_instance_norm_param *params);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S16_Dim1(const xai_pTile3D inTile,
+                                                            xai_pArray meanArr,
+                                                            xai_pArray recipArr,
+                                                            xai_pArray buffArr,
+                                                            xai_pArray buffArrSoS,
+                                                            const xai_pArray rSqrtTable,
+                                                            const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S16_Dim2(const xai_pTile3D inTile,
+                                                            xai_pArray meanArr,
+                                                            xai_pArray recipArr,
+                                                            xai_pArray buffArr,
+                                                            xai_pArray buffArrSoS,
+                                                            const xai_pArray rSqrtTable,
+                                                            const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S16_Dim3(const xai_pTile3D inTile,
+                                                            xai_pArray meanArr,
+                                                            xai_pArray recipArr,
+                                                            xai_pArray buffArr,
+                                                            xai_pArray buffArrSoS,
+                                                            const xai_pArray rSqrtTable,
+                                                            const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_Dim1(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        xai_pArray buffArr,
+                                                        xai_pArray buffArrSoS,
+                                                        const xai_pArray rSqrtTable,
+                                                        const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_Dim2(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        xai_pArray buffArr,
+                                                        xai_pArray buffArrSoS,
+                                                        const xai_pArray rSqrtTable,
+                                                        const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_Dim3(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        xai_pArray buffArr,
+                                                        xai_pArray buffArrSoS,
+                                                        const xai_pArray rSqrtTable,
+                                                        const xai_cnn_instance_norm_param *params);
+
+/* applyInstanceNorm APIs */
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D(const xai_pTile3D inTile,
+                                              xai_pArray meanArr,
+                                              xai_pArray recipArr,
+                                              const xai_pArray alphaArr,
+                                              const xai_pArray betaArr,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8_WHD(const xai_pTile3D inTile,
+                                                     xai_pArray meanArr,
+                                                     xai_pArray recipArr,
+                                                     const xai_pArray alphaArr,
+                                                     const xai_pArray betaArr,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8_WHD(const xai_pTile3D inTile,
+                                                     xai_pArray meanArr,
+                                                     xai_pArray recipArr,
+                                                     const xai_pArray alphaArr,
+                                                     const xai_pArray betaArr,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8U8_WHD(const xai_pTile3D inTile,
+                                                       xai_pArray meanArr,
+                                                       xai_pArray recipArr,
+                                                       const xai_pArray alphaArr,
+                                                       const xai_pArray betaArr,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S8_WHD(const xai_pTile3D inTile,
+                                                       xai_pArray meanArr,
+                                                       xai_pArray recipArr,
+                                                       const xai_pArray alphaArr,
+                                                       const xai_pArray betaArr,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8S16_WHD(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        const xai_pArray alphaArr,
+                                                        const xai_pArray betaArr,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S16_WHD(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        const xai_pArray alphaArr,
+                                                        const xai_pArray betaArr,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S16_WHD(const xai_pTile3D inTile,
+                                                      xai_pArray meanArr,
+                                                      xai_pArray recipArr,
+                                                      const xai_pArray alphaArr,
+                                                      const xai_pArray betaArr,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8_DWH(const xai_pTile3D inTile,
+                                                     xai_pArray meanArr,
+                                                     xai_pArray recipArr,
+                                                     const xai_pArray alphaArr,
+                                                     const xai_pArray betaArr,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8_DWH(const xai_pTile3D inTile,
+                                                     xai_pArray meanArr,
+                                                     xai_pArray recipArr,
+                                                     const xai_pArray alphaArr,
+                                                     const xai_pArray betaArr,
+                                                     xai_pTile3D outTile,
+                                                     const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8U8_DWH(const xai_pTile3D inTile,
+                                                       xai_pArray meanArr,
+                                                       xai_pArray recipArr,
+                                                       const xai_pArray alphaArr,
+                                                       const xai_pArray betaArr,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S8_DWH(const xai_pTile3D inTile,
+                                                       xai_pArray meanArr,
+                                                       xai_pArray recipArr,
+                                                       const xai_pArray alphaArr,
+                                                       const xai_pArray betaArr,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8S16_DWH(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        const xai_pArray alphaArr,
+                                                        const xai_pArray betaArr,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S16_DWH(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        const xai_pArray alphaArr,
+                                                        const xai_pArray betaArr,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S16_DWH(const xai_pTile3D inTile,
+                                                      xai_pArray meanArr,
+                                                      xai_pArray recipArr,
+                                                      const xai_pArray alphaArr,
+                                                      const xai_pArray betaArr,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8_Dim1(const xai_pTile3D inTile,
+                                                      xai_pArray meanArr,
+                                                      xai_pArray recipArr,
+                                                      const xai_pArray alphaArr,
+                                                      const xai_pArray betaArr,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8_Dim1(const xai_pTile3D inTile,
+                                                      xai_pArray meanArr,
+                                                      xai_pArray recipArr,
+                                                      const xai_pArray alphaArr,
+                                                      const xai_pArray betaArr,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8U8_Dim1(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        const xai_pArray alphaArr,
+                                                        const xai_pArray betaArr,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8S16_Dim1(const xai_pTile3D inTile,
+                                                         xai_pArray meanArr,
+                                                         xai_pArray recipArr,
+                                                         const xai_pArray alphaArr,
+                                                         const xai_pArray betaArr,
+                                                         xai_pTile3D outTile,
+                                                         const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S8_Dim1(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        const xai_pArray alphaArr,
+                                                        const xai_pArray betaArr,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S16_Dim1(const xai_pTile3D inTile,
+                                                         xai_pArray meanArr,
+                                                         xai_pArray recipArr,
+                                                         const xai_pArray alphaArr,
+                                                         const xai_pArray betaArr,
+                                                         xai_pTile3D outTile,
+                                                         const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S16_Dim1(const xai_pTile3D inTile,
+                                                       xai_pArray meanArr,
+                                                       xai_pArray recipArr,
+                                                       const xai_pArray alphaArr,
+                                                       const xai_pArray betaArr,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8_Dim2(const xai_pTile3D inTile,
+                                                      xai_pArray meanArr,
+                                                      xai_pArray recipArr,
+                                                      const xai_pArray alphaArr,
+                                                      const xai_pArray betaArr,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8_Dim2(const xai_pTile3D inTile,
+                                                      xai_pArray meanArr,
+                                                      xai_pArray recipArr,
+                                                      const xai_pArray alphaArr,
+                                                      const xai_pArray betaArr,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8U8_Dim2(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        const xai_pArray alphaArr,
+                                                        const xai_pArray betaArr,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8S16_Dim2(const xai_pTile3D inTile,
+                                                         xai_pArray meanArr,
+                                                         xai_pArray recipArr,
+                                                         const xai_pArray alphaArr,
+                                                         const xai_pArray betaArr,
+                                                         xai_pTile3D outTile,
+                                                         const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S8_Dim2(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        const xai_pArray alphaArr,
+                                                        const xai_pArray betaArr,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S16_Dim2(const xai_pTile3D inTile,
+                                                         xai_pArray meanArr,
+                                                         xai_pArray recipArr,
+                                                         const xai_pArray alphaArr,
+                                                         const xai_pArray betaArr,
+                                                         xai_pTile3D outTile,
+                                                         const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S16_Dim2(const xai_pTile3D inTile,
+                                                       xai_pArray meanArr,
+                                                       xai_pArray recipArr,
+                                                       const xai_pArray alphaArr,
+                                                       const xai_pArray betaArr,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_instance_norm_param * params);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8_Dim3(const xai_pTile3D inTile,
+                                                      xai_pArray meanArr,
+                                                      xai_pArray recipArr,
+                                                      const xai_pArray alphaArr,
+                                                      const xai_pArray betaArr,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8_Dim3(const xai_pTile3D inTile,
+                                                      xai_pArray meanArr,
+                                                      xai_pArray recipArr,
+                                                      const xai_pArray alphaArr,
+                                                      const xai_pArray betaArr,
+                                                      xai_pTile3D outTile,
+                                                      const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8U8_Dim3(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        const xai_pArray alphaArr,
+                                                        const xai_pArray betaArr,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8S16_Dim3(const xai_pTile3D inTile,
+                                                         xai_pArray meanArr,
+                                                         xai_pArray recipArr,
+                                                         const xai_pArray alphaArr,
+                                                         const xai_pArray betaArr,
+                                                         xai_pTile3D outTile,
+                                                         const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S8_Dim3(const xai_pTile3D inTile,
+                                                        xai_pArray meanArr,
+                                                        xai_pArray recipArr,
+                                                        const xai_pArray alphaArr,
+                                                        const xai_pArray betaArr,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S16_Dim3(const xai_pTile3D inTile,
+                                                         xai_pArray meanArr,
+                                                         xai_pArray recipArr,
+                                                         const xai_pArray alphaArr,
+                                                         const xai_pArray betaArr,
+                                                         xai_pTile3D outTile,
+                                                         const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S16_Dim3(const xai_pTile3D inTile,
+                                                       xai_pArray meanArr,
+                                                       xai_pArray recipArr,
+                                                       const xai_pArray alphaArr,
+                                                       const xai_pArray betaArr,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_instance_norm_param * params);
+
+
+/* Wrapper functions: xaiApplyInstanceNorm3D_Dim1, xaiApplyInstanceNorm3D_Dim2, xaiApplyInstanceNorm3D_Dim3 */
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_Dim1(const xai_pTile3D inTile,
+                                                   xai_pArray meanArr,
+                                                   xai_pArray recipArr,
+                                                   const xai_pArray alphaArr,
+                                                   const xai_pArray betaArr,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_Dim2(const xai_pTile3D inTile,
+                                                   xai_pArray meanArr,
+                                                   xai_pArray recipArr,
+                                                   const xai_pArray alphaArr,
+                                                   const xai_pArray betaArr,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_Dim3(const xai_pTile3D inTile,
+                                                   xai_pArray meanArr,
+                                                   xai_pArray recipArr,
+                                                   const xai_pArray alphaArr,
+                                                   const xai_pArray betaArr,
+                                                   xai_pTile3D outTile,
+                                                   const xai_cnn_instance_norm_param * params);
+
+/*Channelwise Divide variants*/
+_XAI_API_ XAI_ERR_TYPE xaiDivide3D(const xai_pTile3D inTile,
+                                   const xai_pArray channelDivisor,
+                                   xai_pTile3D outTile,
+                                   const xai_cnn_divide3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiDivide3D_S8_WHD(const xai_pTile3D inTile,
+                                          const xai_pArray channelDivisor,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_divide3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiDivide3D_U8_WHD(const xai_pTile3D inTile,
+                                          const xai_pArray channelDivisor,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_divide3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiDivide3D_U8S8_WHD(const xai_pTile3D inTile,
+                                            const xai_pArray channelDivisor,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_divide3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiDivide3D_S16_WHD(const xai_pTile3D inTile,
+                                           const xai_pArray channelDivisor,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_divide3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiDivide3D_S16S8_WHD(const xai_pTile3D inTile,
+                                             const xai_pArray channelDivisor,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_divide3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiDivide3D_S8_DWH(const xai_pTile3D inTile,
+                                          const xai_pArray channelDivisor,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_divide3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiDivide3D_U8_DWH(const xai_pTile3D inTile,
+                                          const xai_pArray channelDivisor,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_divide3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiDivide3D_U8S8_DWH(const xai_pTile3D inTile,
+                                            const xai_pArray channelDivisor,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_divide3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiDivide3D_S16_DWH(const xai_pTile3D inTile,
+                                           const xai_pArray channelDivisor,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_divide3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiDivide3D_S16S8_DWH(const xai_pTile3D inTile,
+                                             const xai_pArray channelDivisor,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_divide3D_params *params);
+
+/* Crop and Resize variants */
+
+_XAI_API_ XAI_ERR_TYPE xaiCropResize3D(const xai_pTile3D inTile,
+                                       const xai_pArray ROIinfo,
+                                       xai_pTile4D outTile,
+                                       const xai_cnn_cropResize3D_params *params);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiCropResize3D_S8_DWH(const xai_pTile3D inTile,
+                                              const xai_pArray ROIinfo,
+                                              xai_pTile4D outTile,
+                                              const xai_cnn_cropResize3D_params *params);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiCropResize3D_U8_DWH(const xai_pTile3D inTile,
+                                              const xai_pArray ROIinfo,
+                                              xai_pTile4D outTile,
+                                              const xai_cnn_cropResize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCropResize3D_S16_DWH(const xai_pTile3D inTile,
+                                               const xai_pArray ROIinfo,
+                                               xai_pTile4D outTile,
+                                               const xai_cnn_cropResize3D_params *params);
+
+/* ReduceSum3D variants */
+// -----------------------------------------------------------------------------------
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D(const xai_pTile3D inTile,
+                                      xai_pArray bufferArray,
+                                      xai_pTile3D outTile,
+                                      const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_S8(const xai_pTile3D inTile,
+                                         xai_pArray bufferArray,
+                                         xai_pTile3D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_S8U8(const xai_pTile3D inTile,
+                                           xai_pArray bufferArray,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_S8S16(const xai_pTile3D inTile,
+                                            xai_pArray bufferArray,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_U8(const xai_pTile3D inTile,
+                                         xai_pArray bufferArray,
+                                         xai_pTile3D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_U8S8(const xai_pTile3D inTile,
+                                           xai_pArray bufferArray,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_U8S16(const xai_pTile3D inTile,
+                                            xai_pArray bufferArray,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_S16(const xai_pTile3D inTile,
+                                          xai_pArray bufferArray,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_U16(const xai_pTile3D inTile,
+                                          xai_pArray bufferArray,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_S32(const xai_pTile3D inTile,
+                                          xai_pArray bufferArray,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_U32(const xai_pTile3D inTile,
+                                          xai_pArray bufferArray,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reduce_params *params);
+// -----------------------------------------------------------------------------------
+/* ReduceSum4D variants */
+// -----------------------------------------------------------------------------------
+#ifndef GLOW_BUILD
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D(const xai_pTile4D inTile,
+                                      xai_pArray bufferArray,
+                                      xai_pTile4D outTile,
+                                      const xai_cnn_reduce_params *params);
+#endif
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_S8(const xai_pTile4D inTile,
+                                         xai_pArray bufferArray,
+                                         xai_pTile4D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_S8U8(const xai_pTile4D inTile,
+                                           xai_pArray bufferArray,
+                                           xai_pTile4D outTile,
+                                           const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_S8S16(const xai_pTile4D inTile,
+                                            xai_pArray bufferArray,
+                                            xai_pTile4D outTile,
+                                            const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_U8(const xai_pTile4D inTile,
+                                         xai_pArray bufferArray,
+                                         xai_pTile4D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_U8S8(const xai_pTile4D inTile,
+                                           xai_pArray bufferArray,
+                                           xai_pTile4D outTile,
+                                           const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_U8S16(const xai_pTile4D inTile,
+                                            xai_pArray bufferArray,
+                                            xai_pTile4D outTile,
+                                            const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_S16(const xai_pTile4D inTile,
+                                          xai_pArray bufferArray,
+                                          xai_pTile4D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_U16(const xai_pTile4D inTile,
+                                          xai_pArray bufferArray,
+                                          xai_pTile4D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_S32(const xai_pTile4D inTile,
+                                          xai_pArray bufferArray,
+                                          xai_pTile4D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_U32(const xai_pTile4D inTile,
+                                          xai_pArray bufferArray,
+                                          xai_pTile4D outTile,
+                                          const xai_cnn_reduce_params *params);
+// ------------------------ ReduceMax3D / ReduceMin3D variants ------------------------
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D(const xai_pTile3D inTile,
+                                      xai_pTile3D outTile,
+                                      const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D_U8(const xai_pTile3D inTile,
+                                         xai_pTile3D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D_S8(const xai_pTile3D inTile,
+                                         xai_pTile3D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D_U8(const xai_pTile3D inTile,
+                                         xai_pTile3D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D(const xai_pTile3D inTile,
+                                      xai_pTile3D outTile,
+                                      const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D_S8(const xai_pTile3D inTile,
+                                         xai_pTile3D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D_S16(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D_S16(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D_U16(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D_U16(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D_S32(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D_S32(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D_U32(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D_U32(const xai_pTile3D inTile,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_reduce_params *params);
+// ------------------------ ReduceMax4D / ReduceMin4D variants ------------------------
+#ifndef GLOW_BUILD
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D(const xai_pTile4D inTile,
+                                      xai_pTile4D outTile,
+                                      const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D(const xai_pTile4D inTile,
+                                      xai_pTile4D outTile,
+                                      const xai_cnn_reduce_params *params);
+#endif
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D_U8(const xai_pTile4D inTile,
+                                         xai_pTile4D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D_S8(const xai_pTile4D inTile,
+                                         xai_pTile4D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D_U8(const xai_pTile4D inTile,
+                                         xai_pTile4D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D_S8(const xai_pTile4D inTile,
+                                         xai_pTile4D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D_S16(const xai_pTile4D inTile,
+                                          xai_pTile4D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D_S16(const xai_pTile4D inTile,
+                                          xai_pTile4D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D_U16(const xai_pTile4D inTile,
+                                          xai_pTile4D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D_U16(const xai_pTile4D inTile,
+                                          xai_pTile4D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D_S32(const xai_pTile4D inTile,
+                                          xai_pTile4D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D_S32(const xai_pTile4D inTile,
+                                          xai_pTile4D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D_U32(const xai_pTile4D inTile,
+                                          xai_pTile4D outTile,
+                                          const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D_U32(const xai_pTile4D inTile,
+                                          xai_pTile4D outTile,
+                                          const xai_cnn_reduce_params *params);
+/* ReduceSAD3D variants */
+_XAI_API_ XAI_ERR_TYPE xaiReduceSAD3D(const xai_pTile3D inTile1,
+                                      const xai_pTile3D inTile2,
+                                      xai_pArray buffArr,
+                                      xai_pTile3D outTile,
+                                      const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSAD3D_S16UX(const xai_pTile3D inTile1,
+                                            const xai_pTile3D inTile2,
+                                            xai_pArray buffArr,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSAD3D_S8U16(const xai_pTile3D inTile1,
+                                            const xai_pTile3D inTile2,
+                                            xai_pArray buffArr,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSAD3D_S8U8(const xai_pTile3D inTile1,
+                                           const xai_pTile3D inTile2,
+                                           xai_pArray buffArr,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSAD3D_U8U16(const xai_pTile3D inTile1,
+                                            const xai_pTile3D inTile2,
+                                            xai_pArray buffArr,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSAD3D_U8(const xai_pTile3D inTile1,
+                                         const xai_pTile3D inTile2,
+                                         xai_pArray buffArr,
+                                         xai_pTile3D outTile,
+                                         const xai_cnn_reduce_params *params);
+
+//SVDF function
+_XAI_API_ XAI_ERR_TYPE svdf_S8I8(const xai_pTile3D inTile, xai_pTile3D stateTile,
+                                 const xai_pTile4D betaTile, const xai_pTile4D alphaTile,
+                                 xai_pTile3D scratchTile, const xai_pArray biasTile,
+                                 xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams,
+                                 const xai_pArray outputScaleArray1,
+                                 const xai_pArray outputScaleArray2,
+                                 const xai_pArray fixUpBiasBuf);
+
+_XAI_API_ XAI_ERR_TYPE svdf_U8I8(const xai_pTile3D inTile, xai_pTile3D stateTile,
+                                 const xai_pTile4D betaTile, const xai_pTile4D alphaTile,
+                                 xai_pTile3D scratchTile, const xai_pArray biasTile,
+                                 xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams,
+                                 const xai_pArray outputScaleArray1,
+                                 const xai_pArray outputScaleArray2,
+                                 const xai_pArray fixUpBiasBuf);
+
+_XAI_API_ XAI_ERR_TYPE xaiSvdf_VQ(const xai_pTile3D inTile, xai_pTile3D stateTile,
+                                  const xai_pTile4D betaTile, const xai_pTile4D alphaTile,
+                                  xai_pTile3D scratchTile, const xai_pArray biasTile,
+                                  xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams,
+                                  const xai_pArray outputScaleArray1,
+                                  const xai_pArray outputScaleArray2,
+                                  const xai_pArray fixUpBiasBuf);
+
+//SVDF function
+_XAI_API_ XAI_ERR_TYPE svdfAligned(const xai_pTile3D inTile, xai_pTile3D stateTile,
+                                   const xai_pTile4D betaTile, const xai_pTile4D alphaTile,
+                                   xai_pTile3D scratchTile, const xai_pArray biasTile,
+                                   xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams,
+                                   const xai_pArray outputScaleArray1,
+                                   const xai_pArray outputScaleArray2,
+                                   const xai_pArray fixUpBiasBuf);
+
+//SVDF function
+_XAI_API_ XAI_ERR_TYPE xaiSvdf_S8U8_VQ(const xai_pTile3D inTile, xai_pTile3D stateTile,
+                                       const xai_pTile4D betaTile, const xai_pTile4D alphaTile,
+                                       xai_pTile3D scratchTile, const xai_pArray biasArray,
+                                       xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams,
+                                       const xai_pArray outputScaleArray1,
+                                       const xai_pArray outputScaleArray2,
+                                       const xai_pArray fixUpBiasBuf);
+
+_XAI_API_ XAI_ERR_TYPE xaiSvdf_U8U8_VQ(const xai_pTile3D inTile, xai_pTile3D stateTile,
+                                       const xai_pTile4D betaTile, const xai_pTile4D alphaTile,
+                                       xai_pTile3D scratchTile, const xai_pArray biasArray,
+                                       xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams,
+                                       const xai_pArray outputScaleArray1,
+                                       const xai_pArray outputScaleArray2,
+                                       const xai_pArray fixUpBiasBuf);
+
+//SVDF function
+_XAI_API_ XAI_ERR_TYPE xaiSvdf_AS8AS8_VQ(const xai_pTile3D inTile, xai_pTile3D stateTile,
+                                         const xai_pTile4D betaTile, const xai_pTile4D alphaTile,
+                                         xai_pTile3D scratchTile, const xai_pArray biasArray,
+                                         xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams,
+                                         const xai_pArray outputScaleArray1,
+                                         const xai_pArray outputScaleArray2,
+                                         const xai_pArray fixUpBiasBuf);
+
+/*************************************************************************************************/
+/* Quantize3D/4D (FP32 to fixed point) is declared with the fixed-point routines because it can  */
+/* be used on hardware without AO or FP32 support, via the REF code inside the opt.              */
+/*************************************************************************************************/
+
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantize3D_F32U8(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantize3D_F32S8(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantize3D_F32S16(const xai_pTile3D inTile,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantize4D_F32U8(const xai_pTile4D inTile,
+                                           xai_pTile4D outTile,
+                                           const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantize4D_F32S8(const xai_pTile4D inTile,
+                                           xai_pTile4D outTile,
+                                           const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantize4D_F32S16(const xai_pTile4D inTile,
+                                            xai_pTile4D outTile,
+                                            const xai_cnn_quantDequantA_params *pparams);
+
+/*************************************************************************************************/
+/**************************   END of Fixed Point routines declaration  ***************************/
+
+#if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+/***************************************************************************************************/
+/******************************  FP16 routines declaration  ****************************************/
+/***************************************************************************************************/
+
+_XAI_API_ XAI_ERR_TYPE xaiBroadcastAddA3D_F16(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_eltwise_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBroadcastSub3D_F16(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_eltwise_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBroadcastEltwiseEqualA3D_F16(const xai_pTile3D inTile1,
+                                                       const xai_pTile3D inTile2,
+                                                       xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiBroadcastEltwiseNotEqualA3D_F16(const xai_pTile3D inTile1,
+                                                          const xai_pTile3D inTile2,
+                                                          xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiBroadcastMulA3D_F16(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_eltwiseMul_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedA3D_F16(const xai_pTile3D inTile,
+                                                const xai_pTile4D coeffTile,
+                                                const xai_pArray biasArray,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_conv_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedA3D2_F16(const xai_pTile3D inTile,
+                                                 const xai_pTile4D coeffTile,
+                                                 const xai_pArray biasArray,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_conv_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedA3D2_F16_FOLD8(const xai_pTile3D inTile,
+                                                       const xai_pTile4D coeffTile,
+                                                       const xai_pArray biasArray,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_conv_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedA3D2_F16_FOLD16(const xai_pTile3D inTile,
+                                                        const xai_pTile4D coeffTile,
+                                                        const xai_pArray biasArray,
+                                                        xai_pTile3D outTile,
+                                                        const xai_cnn_conv_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_F16(const xai_pTile4D inTile,
+                                                             const xai_pTile4D coeffTile,
+                                                             const xai_pArray biasArray,
+                                                             xai_pArray accArray,
+                                                             xai_pTile4D outTile,
+                                                             const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d1_F16_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            const xai_cnn_conv_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j1d1_F16_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            const xai_cnn_conv_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_2x2j1d1_F16_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d1_F16_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j2d1_F16_MOW_WHD(const xai_pTile3D inTile,
+                                                            const xai_pTile4D coeffTile,
+                                                            const xai_pArray biasArray,
+                                                            xai_pTile3D outTile,
+                                                            const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_MXN_F16Ca2_MOD_DWH(const xai_pTile3D inTile,
+                                                         const xai_pTile4D coeffTile,
+                                                         const xai_pArray biasArray,
+                                                         xai_pTile3D outTile,
+                                                         const xai_cnn_conv_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_1X1_F16Ca2_MOD_DWH(const xai_pTile3D inTile,
+                                                         const xai_pTile4D coeffTile,
+                                                         const xai_pArray biasArray,
+                                                         xai_pTile3D outTile,
+                                                         const xai_cnn_conv_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_1X1_F16Ca2_MOD_WHD_DWH(const xai_pTile3D inTile,
+                                                             const xai_pTile4D coeffTile,
+                                                             const xai_pArray biasArray,
+                                                             xai_pTile3D outTile,
+                                                             const xai_cnn_conv_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_MXN_F16Ca2_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile4D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxN_F16_MOD_DWH(const xai_pTile3D inTile,
+                                                                 const xai_pTile3D coeffTile,
+                                                                 const xai_pArray biasArray,
+                                                                 xai_pTile3D outTile,
+                                                                 const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxNj1d2_F16_MOW_WHD(const xai_pTile3D inTile,
+                                                                     const xai_pTile3D coeffTile,
+                                                                     const xai_pArray biasArray,
+                                                                     xai_pTile3D outTile,
+                                                                     const xai_cnn_depthwiseDilatedConv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseLOGA3D_F16(const xai_pTile3D inTile,
+                                            xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiExp3D_F16(const xai_pTile3D inTile,
+                                    xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMaxA3D_F16(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_reduce_params *params);
+_XAI_API_ XAI_ERR_TYPE xaiReduceMinA3D_F16(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMaxA4D_F16(const xai_pTile4D inTile,
+                                           xai_pTile4D outTile,
+                                           const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMinA4D_F16(const xai_pTile4D inTile,
+                                           xai_pTile4D outTile,
+                                           const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSumA3D_F16(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceSumA4D_F16(const xai_pTile4D inTile,
+                                           xai_pTile4D outTile,
+                                           const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMeanA3D_F16(const xai_pTile3D inTile,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceMeanA4D_F16(const xai_pTile4D inTile,
+                                            xai_pArray intermediateArray,
+                                            xai_pTile4D outTile,
+                                            const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceProdA3D_F16(const xai_pTile3D inTile,
+                                            xai_pArray intermediateArray,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiReduceProdA4D_F16(const xai_pTile4D inTile,
+                                            xai_pArray intermediateArray,
+                                            xai_pTile4D outTile,
+                                            const xai_cnn_reduce_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantizeA3D_F16U8(const xai_pTile3D inTile,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantizeA3D_F16S8(const xai_pTile3D inTile,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantize3D_F16S16(const xai_pTile3D inTile,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantizeA4D_F16U8(const xai_pTile4D inTile,
+                                            xai_pTile4D outTile,
+                                            const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantizeA4D_F16S8(const xai_pTile4D inTile,
+                                            xai_pTile4D outTile,
+                                            const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantize4D_F16S16(const xai_pTile4D inTile,
+                                            xai_pTile4D outTile,
+                                            const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeA3D_U8F16(const xai_pTile3D inTile,
+                                              xai_pTile3D outTile,
+                                              xai_pArray lut,
+                                              const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeA3D_S8F16(const xai_pTile3D inTile,
+                                              xai_pTile3D outTile,
+                                              xai_pArray lut,
+                                              const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantize3D_S16F16(const xai_pTile3D inTile,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeA4D_U8F16(const xai_pTile4D inTile,
+                                              xai_pTile4D outTile,
+                                              xai_pArray lut,
+                                              const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeA4D_S8F16(const xai_pTile4D inTile,
+                                              xai_pTile4D outTile,
+                                              xai_pArray lut,
+                                              const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantize4D_S16F16(const xai_pTile4D inTile,
+                                              xai_pTile4D outTile,
+                                              const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeAVQ3D_S8F16(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                xai_pArray outScaleArray,
+                                                const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeAVQ4D_S8F16(const xai_pTile4D inTile,
+                                                xai_pTile4D outTile,
+                                                xai_pArray outScaleArray,
+                                                const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiSqrtA3D_F16(const xai_pTile3D inTile,
+                                      xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiRSqrtA3D_F16(const xai_pTile3D inTile,
+                                       xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwisePOWA3D_F16(const xai_pTile3D baseTile,
+                                            const xai_pTile3D exponentTile,
+                                            xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiBroadcastEltwisePOWA3D_F16(const xai_pTile3D baseTile,
+                                                     const xai_pTile3D exponentTile,
+                                                     xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseFLOORA3D_F16(const xai_pTile3D inTile,
+                                              xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseCEILA3D_F16(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseROUNDA3D_F16(const xai_pTile3D inTile,
+                                              xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiDivA3D_F16(const xai_pTile3D numeratorTile,
+                                     const xai_pTile3D denominatorTile,
+                                     xai_pTile3D outTile,
+                                     const xai_cnn_eltwise_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiBroadcastDivA3D_F16(const xai_pTile3D numeratorTile,
+                                              const xai_pTile3D denominatorTile,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_eltwise_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftMaxA3D_F16(const xai_pTile3D inTile,
+                                         xai_pTile3D outTile,
+                                         xai_cnn_softmaxA3D_F16_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftMaxA3D_dim1_F16(const xai_pTile3D inTile,
+                                              xai_pTile3D outTile,
+                                              xai_cnn_softmaxA3D_F16_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftMaxA3D_dim2_F16(const xai_pTile3D inTile,
+                                              xai_pTile3D outTile,
+                                              xai_cnn_softmaxA3D_F16_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSoftMaxA3D_dim3_F16(const xai_pTile3D inTile,
+                                              xai_pTile3D outTile,
+                                              xai_cnn_softmaxA3D_F16_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLogSoftMaxA3D_F16(const xai_pTile3D inTile,
+                                            xai_pTile3D outTile,
+                                            xai_cnn_softmaxA3D_F16_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiSigmoidA3D_F16(const xai_pTile3D inTile,
+                                         xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiTanh3D_F16(const xai_pTile3D inTile,
+                                     xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_3x3_F16_DWH(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_3x3_F16_DWH(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param,
+                                                 const xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_F16_DWH(const xai_pTile3D inTile,
+                                                 xai_pTile3D outTile,
+                                                 const xai_cnn_pooling_params *param,
+                                                 const xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_F16_DWH(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_F16_DWH(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                const xai_cnn_pooling_params * param);
+
+_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxN_F16_DWH(const xai_pTile3D inTile,
+                                                       xai_pTile3D outTile,
+                                                       xai_pTile3D idxTile,
+                                                       const xai_cnn_pooling_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiExtendEdges3D_F16(xai_pTile3D dstTile,
+                                            const xai_pArray pArray,
+                                            xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiExtendEdgesConst3D_F16(xai_pTile3D dstTile,
+                                                 const xb_f16 value,
+                                                 xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiFillTile3D_F16(xai_pTile3D dstTile,
+                                         const xb_f16 value,
+                                         xai_bool fill_edge_extension);
+
+/*_XAI_API_ XAI_ERR_TYPE xaiBiasExtend_F16_MOD(const xai_pArray inBiasArray,
+                                             xai_pArray outBiasArray);*/
+
+/*_XAI_API_ XAI_ERR_TYPE xaiOutScaleExtend_F16_MOD(const xai_pArray outScaleArray,
+                                                 xai_pArray extendedOutScaleArray);*/
+
+/*_XAI_API_ XAI_ERR_TYPE xaiDeConvReOrder4D_F16_NDWH(const xai_pTile4D inTile,
+                                                   xai_pTile4D subCoeffs[],
+                                                   xai_pTile4D superCoeffs[],
+                                                   const xai_cnn_conv_params *param,
+                                                   const uint8_t transposeCoeffsFlag);*/
+
+_XAI_API_ XAI_ERR_TYPE xaiResize3D_SetTileParams(const xai_size3D *inFrame3DSize,
+                                                 const xai_size3D *outFrame3DSize,
+                                                 const xai_cnn_data_order dataOrder,
+                                                 int32_t half_pixel_flag,
+                                                 xai_cnn_resizeA3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_F16_SetTileParams(const xai_size3D *inFrame3DSize,
+                                                     const xai_size3D *outFrame3DSize,
+                                                     const xai_cnn_data_order dataOrder,
+                                                     int32_t half_pixel_flag,
+                                                     xai_cnn_interp3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiInterp3D_F16_DWH(const xai_pTile3D inTile,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_interp3D_params * pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_F16_DWH(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_resize_nearest3D_params * pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_F16_WHD(const xai_pTile3D inTile,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_resize_nearest3D_params *params);
+
+/* HardSwish (F16) */
+_XAI_API_ XAI_ERR_TYPE xaiHardSwish_F16(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile);
+/* ArgMax / ArgMin and top-K merge variants (F16) */
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_F16_dim1(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numLargestVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_F16_dim2(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numLargestVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_F16_dim3(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numLargestVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_F16_dim1(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numSmallestVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_F16_dim2(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numSmallestVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_F16_dim3(const xai_pTile3D inTile,
+                                            xai_pArray bufArray,
+                                            xai_pTile3D outTileIdx,
+                                            xai_pTile3D outTileVal,
+                                            const uint16_t numSmallestVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmax3D_F16_dim1(const xai_pTile3D inTileIdx,
+                                                     const xai_pTile3D inTileVal,
+                                                     const xai_pArray inPtrOffsetArr,
+                                                     xai_pArray bufArray,
+                                                     xai_pTile3D outTileIdx,
+                                                     xai_pTile3D outTileVal,
+                                                     const uint16_t numLargestVal);
+
+_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmin3D_F16_dim1(const xai_pTile3D inTileIdx,
+                                                     const xai_pTile3D inTileVal,
+                                                     const xai_pArray inPtrOffsetArr,
+                                                     xai_pArray bufArray,
+                                                     xai_pTile3D outTileIdx,
+                                                     xai_pTile3D outTileVal,
+                                                     const uint16_t numSmallestVal);
+
+/* PReLU (F16) */
+_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_F16(const xai_pTile3D inTile,
+                                      const xai_pTile3D slopeArray,
+                                      xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_F16_DWH(const xai_pTile3D inTile,
+                                          const xai_pTile3D slopeArray,
+                                          xai_pTile3D outTile);
+
+_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_F16_WHD(const xai_pTile3D inTile,
+                                          const xai_pTile3D slopeArray,
+                                          xai_pTile3D outTile);
+
+/* Leaky ReLU (F16) */
+_XAI_API_ XAI_ERR_TYPE xaiLeakyRELU_F16(const xai_pTile3D inTile,
+                                        xai_pTile3D outTile,
+                                        const xb_f16 slope);
+
+/* LUT APIs */
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Oddsym_F16(const xai_pTile3D inTile,
+                                           const xai_pArray lutArray,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Evensym_F16(const xai_pTile3D inTile,
+                                            const xai_pArray lutArray,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_lut_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Normal_F16(const xai_pTile3D inTile,
+                                           const xai_pArray lutArray,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_lut_params *params);
+
+/* Depthwise convolution (F16) */
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxN_F16_MOD_DWH(const xai_pTile3D inTile,
+                                                                const xai_pTile3D coeffTile,
+                                                                const xai_pArray biasArray,
+                                                                xai_pTile3D outTile,
+                                                                const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj1_F16_MOW_WHD(const xai_pTile3D inTile,
+                                                                  const xai_pTile3D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj2_F16_MOW_WHD(const xai_pTile3D inTile,
+                                                                  const xai_pTile3D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j1_F16_MOW_WHD(const xai_pTile3D inTile,
+                                                                  const xai_pTile3D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j2_F16_MOW_WHD(const xai_pTile3D inTile,
+                                                                  const xai_pTile3D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j1_F16_MOW_WHD(const xai_pTile3D inTile,
+                                                                  const xai_pTile3D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j2_F16_MOW_WHD(const xai_pTile3D inTile,
+                                                                  const xai_pTile3D coeffTile,
+                                                                  const xai_pArray biasArray,
+                                                                  xai_pTile3D outTile,
+                                                                  const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3_F16Ca2_MOD_DWH(const xai_pTile3D inTile,
+                                                                   const xai_pTile3D coeffTile,
+                                                                   const xai_pArray biasArray,
+                                                                   xai_pTile3D outTile,
+                                                                   const xai_cnn_conv_params *param);
+
+_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5_F16Ca2_MOD_DWH(const xai_pTile3D inTile,
+                                                                   const xai_pTile3D coeffTile,
+                                                                   const xai_pArray biasArray,
+                                                                   xai_pTile3D outTile,
+                                                                   const xai_cnn_conv_params *param);
+
+/* Batch normalization and instance normalization (F16) */
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_F16_DWH(const xai_pTile3D inTile,
+                                              const xai_pArray Alpha,
+                                              const xai_pArray Beta,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_batchnorm_params *params);
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_F16_WHD(const xai_pTile3D inTile,
+                                              const xai_pArray Alpha,
+                                              const xai_pArray Beta,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_batchnorm_params *params);
+_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_F16(const xai_pTile3D inTile,
+                                          const xai_pArray Alpha,
+                                          const xai_pArray Beta,
+                                          xai_pTile3D outTile,
+                                          const xai_cnn_batchnorm_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_f16_WHD(const xai_pTile3D inTile,
+                                                           xai_pArray meanArr,
+                                                           xai_pArray recipArr,
+                                                           xai_pArray buffArr,
+                                                           xai_pArray buffArrSoS,
+                                                           const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_f16_DWH(const xai_pTile3D inTile,
+                                                           xai_pArray meanArr,
+                                                           xai_pArray recipArr,
+                                                           xai_pArray buffArr,
+                                                           xai_pArray buffArrSoS,
+                                                           const xai_cnn_instance_norm_param * params);
+/* FP16 instance-norm factor calculation: Dim1/Dim2/Dim3 variants and an unsuffixed variant, all declared with the same six parameters and an XAI_ERR_TYPE return. */
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_F16_Dim1(const xai_pTile3D inTile,
+                                                            xai_pArray meanArr,
+                                                            xai_pArray recipArr,
+                                                            xai_pArray buffArr,
+                                                            xai_pArray buffArrSoS,
+                                                            const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_F16_Dim2(const xai_pTile3D inTile,
+                                                            xai_pArray meanArr,
+                                                            xai_pArray recipArr,
+                                                            xai_pArray buffArr,
+                                                            xai_pArray buffArrSoS,
+                                                            const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_F16_Dim3(const xai_pTile3D inTile,
+                                                            xai_pArray meanArr,
+                                                            xai_pArray recipArr,
+                                                            xai_pArray buffArr,
+                                                            xai_pArray buffArrSoS,
+                                                            const xai_cnn_instance_norm_param * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_F16(const xai_pTile3D inTile,
+                                                       xai_pArray meanArr,
+                                                       xai_pArray recipArr,
+                                                       xai_pArray buffArr,
+                                                       xai_pArray buffArrSoS,
+                                                       const xai_cnn_instance_norm_param * params);
+/* FP16 instance-norm application: Dim1/Dim2/Dim3 and unsuffixed variants; each takes inTile (non-const), meanArr/recipArr, const alphaArr/betaArr, outTile and the instance-norm params. */
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_F16_Dim1(xai_pTile3D inTile,
+                                                       xai_pArray meanArr,
+                                                       xai_pArray recipArr,
+                                                       const xai_pArray alphaArr,
+                                                       const xai_pArray betaArr,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_F16_Dim2(xai_pTile3D inTile,
+                                                       xai_pArray meanArr,
+                                                       xai_pArray recipArr,
+                                                       const xai_pArray alphaArr,
+                                                       const xai_pArray betaArr,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_F16_Dim3(xai_pTile3D inTile,
+                                                       xai_pArray meanArr,
+                                                       xai_pArray recipArr,
+                                                       const xai_pArray alphaArr,
+                                                       const xai_pArray betaArr,
+                                                       xai_pTile3D outTile,
+                                                       const xai_cnn_instance_norm_param *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_F16(xai_pTile3D inTile,
+                                                  xai_pArray meanArr,
+                                                  xai_pArray recipArr,
+                                                  const xai_pArray alphaArr,
+                                                  const xai_pArray betaArr,
+                                                  xai_pTile3D outTile,
+                                                  const xai_cnn_instance_norm_param *params);
+/* FP16 normalize-factor calculation (WHD, DWH and unsuffixed variants): takes a const inTile, the buffArrSoS and pNormScaleArr arrays, and xai_cnn_normalize3D_params. */
+_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_F16_WHD(const xai_pTile3D inTile,
+                                                        xai_pArray buffArrSoS,
+                                                        xai_pArray pNormScaleArr,
+                                                        const xai_cnn_normalize3D_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_F16_DWH(const xai_pTile3D inTile,
+                                                        xai_pArray buffArrSoS,
+                                                        xai_pArray pNormScaleArr,
+                                                        const xai_cnn_normalize3D_params * params);
+
+_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_F16(const xai_pTile3D inTile,
+                                                    xai_pArray buffArrSoS,
+                                                    xai_pArray pNormScaleArr,
+                                                    const xai_cnn_normalize3D_params * params);
+/* FP16 scale application (WHD, DWH and unsuffixed variants): takes inTile, a const pNormScaleArr, pOutTile and xai_cnn_normalize3D_params; returns XAI_ERR_TYPE. */
+_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_F16_WHD(xai_pTile3D inTile,
+                                               const xai_pArray pNormScaleArr,
+                                               xai_pTile3D pOutTile,
+                                               const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_F16_DWH(xai_pTile3D inTile,
+                                               const xai_pArray pNormScaleArr,
+                                               xai_pTile3D pOutTile,
+                                               const xai_cnn_normalize3D_params *params);
+
+_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_F16(xai_pTile3D inTile,
+                                           const xai_pArray pNormScaleArr,
+                                           xai_pTile3D pOutTile,
+                                           const xai_cnn_normalize3D_params *params);
+
+/****************************  END of FP16 routines declaration  ************************************/
+#endif // end of #if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+
+
+#if (XCHAL_HAVE_VISION_SP_VFPU == 1) && (XCHAL_HAVE_VISION_HP_VFPU == 1)
+/***************************************************************************************************/
+/******************************  Mixed FP16/FP32 routines declaration  *****************************/
+/***************************************************************************************************/
+/* xaiQuantize*_F32F16 and xaiDeQuantize*_F16F32 entry points for 3D and 4D tiles, all driven by xai_cnn_quantDequantA_params; the VQ forms take an extra outScaleArray. */
+_XAI_API_ XAI_ERR_TYPE xaiQuantize3D_F32F16(const xai_pTile3D inTile,
+                                            xai_pTile3D outTile,
+                                            const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiQuantize4D_F32F16(const xai_pTile4D inTile,
+                                            xai_pTile4D outTile,
+                                            const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantize3D_F16F32(const xai_pTile3D inTile,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantize4D_F16F32(const xai_pTile4D inTile,
+                                              xai_pTile4D outTile,
+                                              const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeVQ3D_F16F32(const xai_pTile3D inTile,
+                                                xai_pTile3D outTile,
+                                                xai_pArray outScaleArray,
+                                                const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeVQ4D_F16F32(const xai_pTile4D inTile,
+                                                xai_pTile4D outTile,
+                                                xai_pArray outScaleArray,
+                                                const xai_cnn_quantDequantA_params *pparams);
+
+/****************************  END of Mixed FP16/FP32 routines declaration  *************************/
+#endif //end of #if (XCHAL_HAVE_VISION_SP_VFPU == 1) && (XCHAL_HAVE_VISION_HP_VFPU == 1)
+
+
+#if (XCHAL_HAVE_VISION_SP_VFPU == 1)
+/***************************************************************************************************/
+/******************************  FP32 routines declaration  ****************************************/
+/***************************************************************************************************/
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantize3D_U8F32(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantize3D_S8F32(const xai_pTile3D inTile,
+                                             xai_pTile3D outTile,
+                                             const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantize3D_S16F32(const xai_pTile3D inTile,
+                                              xai_pTile3D outTile,
+                                              const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantize4D_U8F32(const xai_pTile4D inTile,
+                                             xai_pTile4D outTile,
+                                             const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantize4D_S8F32(const xai_pTile4D inTile,
+                                             xai_pTile4D outTile,
+                                             const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantize4D_S16F32(const xai_pTile4D inTile,
+                                              xai_pTile4D outTile,
+                                              const xai_cnn_quantDequantA_params *pparams);
+/* The two VQ declarations that follow (3D and 4D, S8F32 only) take an additional non-const outScaleArray argument. */
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeVQ3D_S8F32(const xai_pTile3D inTile,
+                                               xai_pTile3D outTile,
+                                               xai_pArray outScaleArray,
+                                               const xai_cnn_quantDequantA_params *pparams);
+
+_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeVQ4D_S8F32(const xai_pTile4D inTile,
+                                               xai_pTile4D outTile,
+                                               xai_pArray outScaleArray,
+                                               const xai_cnn_quantDequantA_params *pparams);
+
+/* FP32 tile helpers, each taking dstTile as its first parameter: edge extension from a const array or from a */
+/* constant float value (both with a frame3DSize), and tile fill with a float value and a fill_edge_extension flag. */
+_XAI_API_ XAI_ERR_TYPE xaiExtendEdges3D_F32(xai_pTile3D dstTile,
+                                            const xai_pArray pArray,
+                                            xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiExtendEdgesConst3D_F32(xai_pTile3D dstTile,
+                                                 const float value,
+                                                 xai_size3D frame3DSize);
+
+_XAI_API_ XAI_ERR_TYPE xaiFillTile3D_F32(xai_pTile3D dstTile,
+                                         const float value,
+                                         xai_bool fill_edge_extension);
+/****************************  END of FP32 routines declaration  ************************************/
+#endif //end of #if (XCHAL_HAVE_VISION_SP_VFPU == 1)
+#endif //if ((XCHAL_VISION_TYPE >= 6))
+#endif // #ifndef __XAI_CNN_API_H__
diff --git a/backends/cadence/vision/third-party/libxai/include/xai_intrin.h b/backends/cadence/vision/third-party/libxai/include/xai_intrin.h
new file mode 100644
index 00000000000..a2c2aa12328
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai/include/xai_intrin.h
@@ -0,0 +1,1077 @@
+/*
+ * Copyright (c) 2013-2018 Tensilica Inc. ALL RIGHTS RESERVED.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __XAI_INTRIN_H__
+#define __XAI_INTRIN_H__
+
+#include <xtensa/tie/xt_ivpn.h>
+#include <xtensa/tie/xt_misc.h>
+/* Give XCHAL_HAVE_VISION an explicit 0 default when no earlier header defined it; the CSTUBS workarounds in this file test it. */
+#ifndef XCHAL_HAVE_VISION
+#  define XCHAL_HAVE_VISION  0
+#endif
+/* IS_VISION_130 is defined (with no value) only when XCHAL_HW_REL_LX8 == 1 and XCHAL_VISION_SIMD16 == 32. */
+#if ((XCHAL_HW_REL_LX8 == 1) && (XCHAL_VISION_SIMD16 == 32))
+#define IS_VISION_130
+#endif
+
+////////// CSTUBS workarounds
+/* MSVC builds without XCHAL_HAVE_VISION: redefine IVP_ABSSUBNX16(a, b) as IVP_MAXNX16 of the two differences (b - a) and (a - b). */
+#if defined(_MSC_VER) && !XCHAL_HAVE_VISION
+#   undef IVP_ABSSUBNX16
+#   define IVP_ABSSUBNX16(a, b)  IVP_MAXNX16(IVP_SUBNX16(b, a), IVP_SUBNX16(a, b))
+#endif
+/* Non-XCC builds without XCHAL_HAVE_VISION: route IVP_SQZN/IVP_UNSQZN through their CSTUB_ entry points, writing operand `a` via a vselN temporary. */
+#if !defined(__XCC__) && !XCHAL_HAVE_VISION
+typedef vselN  _xai_intrin_private_xb_vselN;
+#   undef IVP_SQZN
+#   define IVP_SQZN(a, b, c)    do { _xai_intrin_private_xb_vselN _sqzntmp; CSTUB_(_TIE_xt_ivp32_IVP_SQZN) (_sqzntmp, b, c); a = _sqzntmp; } while (0)
+#   undef IVP_UNSQZN
+#   define IVP_UNSQZN(a, b, c)  do { _xai_intrin_private_xb_vselN _sqzntmp; CSTUB_(_TIE_xt_ivp32_IVP_UNSQZN) (_sqzntmp, b, c); a = _sqzntmp; } while (0)
+#endif
+/* Compiled out by the `#if 0` below. IVP_SCATTERNX8U/UT emulation: 32 passes, each storing v as a uint8_t at ptr__ + o when m is non-zero, then rotating the value/mask/offset vectors by one 16-bit lane. */
+#if !defined(__XCC__) && XCHAL_HAVE_VISION
+#if 0
+#undef IVP_SCATTERNX8U
+#undef IVP_SCATTERNX8UT
+#define IVP_SCATTERNX8U(val__, ptr__, offs__)                        \
+  {                                                                  \
+    vboolN mask       = IVP_LTNX16(0, 1);                            \
+    xb_vecNx16 mask16 = IVP_MOVNX16T(1, 0, mask);                    \
+    xb_vecNx16U offs1 = (offs__);                                    \
+    xb_vecNx16 val1   = val__;                                       \
+    for (int i = 0; i < 32; i++)                                     \
+    {                                                                \
+      int v = IVP_MOVAVU16(val1);                                    \
+      int o = IVP_MOVAVU16(offs1);                                   \
+      int m = IVP_MOVAVU16(mask16);                                  \
+      if (m) { *((uint8_t *) (ptr__) + o) = v; }                     \
+      val1   = IVP_SELNX16I(0, val1, IVP_SELI_16B_ROTATE_RIGHT_1);   \
+      mask16 = IVP_SELNX16I(0, mask16, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      offs1  = IVP_SELNX16I(0, offs1, IVP_SELI_16B_ROTATE_RIGHT_1);  \
+    }                                                                \
+  }
+
+#define IVP_SCATTERNX8UT(val__, ptr__, offs__, mask__)               \
+  {                                                                  \
+    xb_vecNx16 mask16 = IVP_MOVNX16T(1, 0, (mask__));                \
+    xb_vecNx16 val    = (val__);                                     \
+    xb_vecNx16 off    = (offs__);                                    \
+    for (int i = 0; i < 32; i++)                                     \
+    {                                                                \
+      int v = IVP_MOVAVU16(val);                                     \
+      int o = IVP_MOVAVU16(off);                                     \
+      int m = IVP_MOVAVU16(mask16);                                  \
+      if (m) { *(((uint8_t *) ptr__) + o) = v; }                     \
+      val    = IVP_SELNX16I(0, val, IVP_SELI_16B_ROTATE_RIGHT_1);    \
+      mask16 = IVP_SELNX16I(0, mask16, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      off    = IVP_SELNX16I(0, off, IVP_SELI_16B_ROTATE_RIGHT_1);    \
+    }                                                                \
+  }
+/* Compiled out (enclosing `#if 0`). IVP_SCATTERN_2X32/T emulation: offs__ is shifted right by 2 (IVP_SRLIN_2X32) and used as an element index into ptr__; 16 passes with 32-bit lane rotation. */
+#undef IVP_SCATTERN_2X32
+#undef IVP_SCATTERN_2X32T
+#define IVP_SCATTERN_2X32(val__, ptr__, offs__)                                                                      \
+  {                                                                                                                  \
+    vboolN_2 mask        = IVP_LTN_2X32(0, 1);                                                                       \
+    xb_vecN_2x32v mask32 = IVP_MOVN_2X32T(1, 0, mask);                                                               \
+    xb_vecN_2x32v offs1  = IVP_SRLIN_2X32(offs__, 2);                                                                \
+    xb_vecN_2x32v val1   = val__;                                                                                    \
+    for (int i = 0; i < 16; i++)                                                                                     \
+    {                                                                                                                \
+      int v = IVP_MOVAV32(val1);                                                                                     \
+      int o = IVP_MOVAV32(offs1);                                                                                    \
+      int m = IVP_MOVAV32(mask32);                                                                                   \
+      if (m) { *((ptr__) + o) = v; }                                                                                 \
+      val1   = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(val1), IVP_SELI_32B_ROTATE_RIGHT_1));   \
+      mask32 = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(mask32), IVP_SELI_32B_ROTATE_RIGHT_1)); \
+      offs1  = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(offs1), IVP_SELI_32B_ROTATE_RIGHT_1));  \
+    }                                                                                                                \
+  }
+
+#define IVP_SCATTERN_2X32T(val__, ptr__, offs__, mask__)                                                             \
+  {                                                                                                                  \
+    xb_vecN_2x32v mask32 = IVP_MOVN_2X32T(1, 0, mask__);                                                             \
+    xb_vecN_2x32v offs1  = IVP_SRLIN_2X32(offs__, 2);                                                                \
+    xb_vecN_2x32v val1   = val__;                                                                                    \
+    for (int i = 0; i < 16; i++)                                                                                     \
+    {                                                                                                                \
+      int v = IVP_MOVAV32(val1);                                                                                     \
+      int o = IVP_MOVAV32(offs1);                                                                                    \
+      int m = IVP_MOVAV32(mask32);                                                                                   \
+      if (m) { *((ptr__) + o) = v; }                                                                                 \
+      val1   = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(val1), IVP_SELI_32B_ROTATE_RIGHT_1));   \
+      mask32 = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(mask32), IVP_SELI_32B_ROTATE_RIGHT_1)); \
+      offs1  = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(offs1), IVP_SELI_32B_ROTATE_RIGHT_1));  \
+    }                                                                                                                \
+  }
+/* Compiled out (enclosing `#if 0`). IVP_SCATTER2NX8U_L/UT_L emulation: 32 passes pairing 8-bit values/mask (rotated by one 8-bit lane) with 16-bit offsets (rotated by one 16-bit lane). */
+#undef IVP_SCATTER2NX8U_L
+#undef IVP_SCATTER2NX8UT_L
+#define IVP_SCATTER2NX8U_L(val__, ptr__, offs__)                   \
+  {                                                                \
+    vbool2N mask      = IVP_LT2NX8(0, 1);                          \
+    xb_vec2Nx8 mask8  = IVP_MOV2NX8T(1, 0, mask);                  \
+    xb_vecNx16U offs1 = (offs__);                                  \
+    xb_vec2Nx8 val1   = val__;                                     \
+    for (int i = 0; i < 32; i++)                                   \
+    {                                                              \
+      int v = IVP_MOVAVU8(val1);                                   \
+      int o = IVP_MOVAVU16(offs1);                                 \
+      int m = IVP_MOVAVU8(mask8);                                  \
+      if (m) { *((uint8_t *) (ptr__) + o) = v; }                   \
+      val1  = IVP_SEL2NX8I(0, val1, IVP_SELI_8B_ROTATE_RIGHT_1);   \
+      mask8 = IVP_SEL2NX8I(0, mask8, IVP_SELI_8B_ROTATE_RIGHT_1);  \
+      offs1 = IVP_SELNX16I(0, offs1, IVP_SELI_16B_ROTATE_RIGHT_1); \
+    }                                                              \
+  }
+
+#define IVP_SCATTER2NX8UT_L(val__, ptr__, offs__, mask__)          \
+  {                                                                \
+    vbool2N mask      = mask__;                                    \
+    xb_vec2Nx8 mask8  = IVP_MOV2NX8T(1, 0, mask);                  \
+    xb_vecNx16U offs1 = (offs__);                                  \
+    xb_vec2Nx8 val1   = val__;                                     \
+    for (int i = 0; i < 32; i++)                                   \
+    {                                                              \
+      int v = IVP_MOVAVU8(val1);                                   \
+      int o = IVP_MOVAVU16(offs1);                                 \
+      int m = IVP_MOVAVU8(mask8);                                  \
+      if (m) { *((uint8_t *) (ptr__) + o) = v; }                   \
+      val1  = IVP_SEL2NX8I(0, val1, IVP_SELI_8B_ROTATE_RIGHT_1);   \
+      mask8 = IVP_SEL2NX8I(0, mask8, IVP_SELI_8B_ROTATE_RIGHT_1);  \
+      offs1 = IVP_SELNX16I(0, offs1, IVP_SELI_16B_ROTATE_RIGHT_1); \
+    }                                                              \
+  }
+/* Compiled out (enclosing `#if 0`). IVP_SCATTER2NX8U_H/UT_H emulation: val1 and mask8 are first rotated with IVP_SELI_8B_ROTATE_RIGHT_32, then the same 32-pass store loop runs. */
+#undef IVP_SCATTER2NX8U_H
+#undef IVP_SCATTER2NX8UT_H
+#define IVP_SCATTER2NX8U_H(val__, ptr__, offs__)                   \
+  {                                                                \
+    vbool2N mask      = IVP_LT2NX8(0, 1);                          \
+    xb_vec2Nx8 mask8  = IVP_MOV2NX8T(1, 0, mask);                  \
+    xb_vecNx16U offs1 = (offs__);                                  \
+    xb_vec2Nx8 val1   = val__;                                     \
+                                                                   \
+    val1  = IVP_SEL2NX8I(0, val1, IVP_SELI_8B_ROTATE_RIGHT_32);    \
+    mask8 = IVP_SEL2NX8I(0, mask8, IVP_SELI_8B_ROTATE_RIGHT_32);   \
+    for (int i = 0; i < 32; i++)                                   \
+    {                                                              \
+      int v = IVP_MOVAVU8(val1);                                   \
+      int o = IVP_MOVAVU16(offs1);                                 \
+      int m = IVP_MOVAVU8(mask8);                                  \
+      if (m) { *((uint8_t *) (ptr__) + o) = v; }                   \
+      val1  = IVP_SEL2NX8I(0, val1, IVP_SELI_8B_ROTATE_RIGHT_1);   \
+      mask8 = IVP_SEL2NX8I(0, mask8, IVP_SELI_8B_ROTATE_RIGHT_1);  \
+      offs1 = IVP_SELNX16I(0, offs1, IVP_SELI_16B_ROTATE_RIGHT_1); \
+    }                                                              \
+  }
+
+#define IVP_SCATTER2NX8UT_H(val__, ptr__, offs__, mask__)          \
+  {                                                                \
+    vbool2N mask      = mask__;                                    \
+    xb_vec2Nx8 mask8  = IVP_MOV2NX8T(1, 0, mask);                  \
+    xb_vecNx16U offs1 = (offs__);                                  \
+    xb_vec2Nx8 val1   = val__;                                     \
+                                                                   \
+    val1  = IVP_SEL2NX8I(0, val1, IVP_SELI_8B_ROTATE_RIGHT_32);    \
+    mask8 = IVP_SEL2NX8I(0, mask8, IVP_SELI_8B_ROTATE_RIGHT_32);   \
+    for (int i = 0; i < 32; i++)                                   \
+    {                                                              \
+      int v = IVP_MOVAVU8(val1);                                   \
+      int o = IVP_MOVAVU16(offs1);                                 \
+      int m = IVP_MOVAVU8(mask8);                                  \
+      if (m) { *((uint8_t *) (ptr__) + o) = v; }                   \
+      val1  = IVP_SEL2NX8I(0, val1, IVP_SELI_8B_ROTATE_RIGHT_1);   \
+      mask8 = IVP_SEL2NX8I(0, mask8, IVP_SELI_8B_ROTATE_RIGHT_1);  \
+      offs1 = IVP_SELNX16I(0, offs1, IVP_SELI_16B_ROTATE_RIGHT_1); \
+    }                                                              \
+  }
+/* Compiled out (enclosing `#if 0`). Masked *_V gathers rewritten as the unmasked gather wrapped in a predicated move (IVP_MOV*T) against 0 -- presumably zeroing masked-off lanes; confirm. */
+#undef IVP_GATHERNX8UT_V
+#define IVP_GATHERNX8UT_V(pdst, offs, mask, dly)  IVP_MOVNX16T(IVP_GATHERNX8U_V((pdst), (offs), (dly)), 0, mask)
+
+#undef IVP_GATHERNX16T_V
+#define IVP_GATHERNX16T_V(pdst, offs, mask, dly)  IVP_MOVNX16T(IVP_GATHERNX16_V((pdst), (offs), (dly)), 0, mask)
+
+#undef IVP_GATHERN_2X32T_V
+#define IVP_GATHERN_2X32T_V(pdst, offs, mask, dly)  IVP_MOVN_2X32T(IVP_GATHERN_2X32_V((pdst), (offs), (dly)), 0, mask)
+#endif // #if 0
+#endif //!defined(__XCC__) && XCHAL_HAVE_VISION
+
+#if XCHAL_VISION_QUAD_MAC_TYPE == 0
+#ifndef IVP_MULQA2N8XR8
+#define IVP_MULQA2N8XR8(_dacc_, _dvec3_, _dvec2_, _dvec1_, _dvec0_, _scalar32_)  {            \
+    xb_vec2Nx8 dvecS = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_))); \
+    IVP_MULA2NX8(_dacc_, _dvec0_, IVP_REP2NX8(dvecS, 0));                                     \
+    IVP_MULA2NX8(_dacc_, _dvec1_, IVP_REP2NX8(dvecS, 1));                                     \
+    IVP_MULA2NX8(_dacc_, _dvec2_, IVP_REP2NX8(dvecS, 2));                                     \
+    IVP_MULA2NX8(_dacc_, _dvec3_, IVP_REP2NX8(dvecS, 3));                                     \
+}
+#endif
+
+#ifndef IVP_MULUSQA2N8XR8
+#define IVP_MULUSQA2N8XR8(_dacc_, _dvec3_, _dvec2_, _dvec1_, _dvec0_, _scalar32_)  {          \
+    xb_vec2Nx8 dvecS = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_))); \
+    IVP_MULUSA2NX8(_dacc_, _dvec0_, IVP_REP2NX8(dvecS, 0));                                   \
+    IVP_MULUSA2NX8(_dacc_, _dvec1_, IVP_REP2NX8(dvecS, 1));                                   \
+    IVP_MULUSA2NX8(_dacc_, _dvec2_, IVP_REP2NX8(dvecS, 2));                                   \
+    IVP_MULUSA2NX8(_dacc_, _dvec3_, IVP_REP2NX8(dvecS, 3));                                   \
+}
+#endif
+
+#if 0 // Currently disabled as there is no usecase. Kept it so that it can be used in future if required.
+#ifndef IVP_MULSUQ2N8XR8 /* Disabled. Builds the result from IVP_MULUS2NX8 plus three IVP_MULUSA2NX8 steps, with the replicated scalar lane as first operand and _dvecK_ as second. */
+static inline xb_vec2Nx24 IVP_MULSUQ2N8XR8(xb_vec2Nx8 _dvec3_, xb_vec2Nx8 _dvec2_, xb_vec2Nx8 _dvec1_, xb_vec2Nx8 _dvec0_, int32_t _scalar32_)
+{
+  xb_vec2Nx24 _dacc_;
+  xb_vec2Nx8 dvecS = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_)));
+  _dacc_ = IVP_MULUS2NX8(IVP_REP2NX8(dvecS, 0), _dvec0_);
+  IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(dvecS, 1), _dvec1_);
+  IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(dvecS, 2), _dvec2_);
+  IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(dvecS, 3), _dvec3_);
+  return(_dacc_);
+}
+#endif
+/* Disabled (enclosing `#if 0`). Accumulating form using IVP_MULUSA2NX8 with the replicated scalar lane first. NOTE(review): brace-only body; wrap in do { } while (0) if re-enabled. */
+#ifndef IVP_MULSUQA2N8XR8
+#define IVP_MULSUQA2N8XR8(_dacc_, _dvec3_, _dvec2_, _dvec1_, _dvec0_, _scalar32_)  {          \
+    xb_vec2Nx8 dvecS = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_))); \
+    IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(dvecS, 0), _dvec0_);                                   \
+    IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(dvecS, 1), _dvec1_);                                   \
+    IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(dvecS, 2), _dvec2_);                                   \
+    IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(dvecS, 3), _dvec3_);                                   \
+}
+#endif
+/* Disabled (enclosing `#if 0`). Accumulating form built from four IVP_MULUUA2NX8 steps. NOTE(review): brace-only body; wrap in do { } while (0) if re-enabled. */
+#ifndef IVP_MULUUQA2N8XR8
+#define IVP_MULUUQA2N8XR8(_dacc_, _dvec3_, _dvec2_, _dvec1_, _dvec0_, _scalar32_)  {          \
+    xb_vec2Nx8 dvecS = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_))); \
+    IVP_MULUUA2NX8(_dacc_, _dvec0_, IVP_REP2NX8(dvecS, 0));                                   \
+    IVP_MULUUA2NX8(_dacc_, _dvec1_, IVP_REP2NX8(dvecS, 1));                                   \
+    IVP_MULUUA2NX8(_dacc_, _dvec2_, IVP_REP2NX8(dvecS, 2));                                   \
+    IVP_MULUUA2NX8(_dacc_, _dvec3_, IVP_REP2NX8(dvecS, 3));                                   \
+}
+#endif
+#endif
+
+#ifndef IVP_MUL4TA2N8XR8
+#define IVP_MUL4TA2N8XR8(_dacc_, _dvec1_, _dvec0_, _scalar32_)  {                                            \
+    xb_vec2Nx8 dvecS = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_)));                \
+    IVP_MULA2NX8(_dacc_, _dvec0_, IVP_REP2NX8(dvecS, 0));                                                    \
+    IVP_MULA2NX8(_dacc_, IVP_SEL2NX8I(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_1), IVP_REP2NX8(dvecS, 1)); \
+    IVP_MULA2NX8(_dacc_, IVP_SEL2NX8I(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_2), IVP_REP2NX8(dvecS, 2)); \
+    IVP_MULA2NX8(_dacc_, IVP_SEL2NX8I(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_3), IVP_REP2NX8(dvecS, 3)); \
+}
+#endif
+
+#ifndef IVP_MULUS4TA2N8XR8
+#define IVP_MULUS4TA2N8XR8(_dacc_, _dvec1_, _dvec0_, _scalar32_)  {                                             \
+    xb_vec2Nx8 dvecS = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_)));                   \
+    IVP_MULUSA2NX8(_dacc_, _dvec0_, IVP_REP2NX8(dvecS, 0));                                                     \
+    IVP_MULUSA2NX8(_dacc_, IVP_SEL2NX8UI(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_1), IVP_REP2NX8(dvecS, 1)); \
+    IVP_MULUSA2NX8(_dacc_, IVP_SEL2NX8UI(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_2), IVP_REP2NX8(dvecS, 2)); \
+    IVP_MULUSA2NX8(_dacc_, IVP_SEL2NX8UI(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_3), IVP_REP2NX8(dvecS, 3)); \
+}
+#endif
+/* Fallback returning a fresh 2Nx24 accumulator. NOTE(review): parameters are xb_vec2Nx8U although the signed IVP_MUL2NX8/IVP_MULA2NX8/IVP_SEL2NX8I forms are used -- confirm intended. */
+#ifndef IVP_MUL4T2N8XR8
+static inline xb_vec2Nx24 IVP_MUL4T2N8XR8(xb_vec2Nx8U _dvec1_, xb_vec2Nx8U _dvec0_, int _scalar32_)
+{
+  xb_vec2Nx8 coeffLanes = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_)));
+  xb_vec2Nx24 acc       = IVP_MUL2NX8(_dvec0_, IVP_REP2NX8(coeffLanes, 0));
+  /* Terms 1..3 multiply the _dvec1_:_dvec0_ selection (ROTATE_RIGHT_1..3) by replicated lanes 1..3. */
+  IVP_MULA2NX8(acc, IVP_SEL2NX8I(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_1), IVP_REP2NX8(coeffLanes, 1));
+  IVP_MULA2NX8(acc, IVP_SEL2NX8I(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_2), IVP_REP2NX8(coeffLanes, 2));
+  IVP_MULA2NX8(acc, IVP_SEL2NX8I(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_3), IVP_REP2NX8(coeffLanes, 3));
+  return acc;
+}
+#endif
+/* Fallback returning a fresh 2Nx24 accumulator: IVP_MULUS2NX8 for term 0, then three IVP_MULUSA2NX8 terms over IVP_SEL2NX8UI(_dvec1_, _dvec0_, ROTATE_RIGHT_1..3). */
+#ifndef IVP_MULUS4T2N8XR8
+static inline xb_vec2Nx24 IVP_MULUS4T2N8XR8(xb_vec2Nx8U _dvec1_, xb_vec2Nx8U _dvec0_, int _scalar32_)
+{
+  xb_vec2Nx8 coeffLanes = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_)));
+  xb_vec2Nx24 acc       = IVP_MULUS2NX8(_dvec0_, IVP_REP2NX8(coeffLanes, 0));
+  /* Terms 1..3 multiply the _dvec1_:_dvec0_ selection (ROTATE_RIGHT_1..3) by replicated lanes 1..3. */
+  IVP_MULUSA2NX8(acc, IVP_SEL2NX8UI(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_1), IVP_REP2NX8(coeffLanes, 1));
+  IVP_MULUSA2NX8(acc, IVP_SEL2NX8UI(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_2), IVP_REP2NX8(coeffLanes, 2));
+  IVP_MULUSA2NX8(acc, IVP_SEL2NX8UI(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_3), IVP_REP2NX8(coeffLanes, 3));
+  return acc;
+}
+#endif
+/* Fallback returning a fresh 2Nx24 accumulator: IVP_MUL2NX8 of _dvec0_ with replicated lane 0, then IVP_MULA2NX8 of _dvec1_.._dvec3_ with lanes 1..3. */
+#ifndef IVP_MULQ2N8XR8
+static inline xb_vec2Nx24 IVP_MULQ2N8XR8(xb_vec2Nx8 _dvec3_, xb_vec2Nx8 _dvec2_, xb_vec2Nx8 _dvec1_, xb_vec2Nx8 _dvec0_, int32_t _scalar32_)
+{
+  /* Broadcast the 32-bit scalar and reinterpret it as 8-bit lanes for IVP_REP2NX8. */
+  xb_vec2Nx8 coeffLanes = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_)));
+  xb_vec2Nx24 acc       = IVP_MUL2NX8(_dvec0_, IVP_REP2NX8(coeffLanes, 0));
+  IVP_MULA2NX8(acc, _dvec1_, IVP_REP2NX8(coeffLanes, 1));
+  IVP_MULA2NX8(acc, _dvec2_, IVP_REP2NX8(coeffLanes, 2));
+  IVP_MULA2NX8(acc, _dvec3_, IVP_REP2NX8(coeffLanes, 3));
+  return acc;
+}
+#endif
+/* Fallback returning a fresh 2Nx24 accumulator: IVP_MULUS2NX8 of _dvec0_ with replicated lane 0, then IVP_MULUSA2NX8 of _dvec1_.._dvec3_ with lanes 1..3. */
+#ifndef IVP_MULUSQ2N8XR8
+static inline xb_vec2Nx24 IVP_MULUSQ2N8XR8(xb_vec2Nx8U _dvec3_, xb_vec2Nx8U _dvec2_, xb_vec2Nx8U _dvec1_, xb_vec2Nx8U _dvec0_, int32_t _scalar32_)
+{
+  /* Broadcast the 32-bit scalar and reinterpret it as 8-bit lanes for IVP_REP2NX8. */
+  xb_vec2Nx8 coeffLanes = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_)));
+  xb_vec2Nx24 acc       = IVP_MULUS2NX8(_dvec0_, IVP_REP2NX8(coeffLanes, 0));
+  IVP_MULUSA2NX8(acc, _dvec1_, IVP_REP2NX8(coeffLanes, 1));
+  IVP_MULUSA2NX8(acc, _dvec2_, IVP_REP2NX8(coeffLanes, 2));
+  IVP_MULUSA2NX8(acc, _dvec3_, IVP_REP2NX8(coeffLanes, 3));
+  return acc;
+}
+#endif
+#endif //#if XCHAL_VISION_QUAD_MAC_TYPE == 0
+
+#if XCHAL_HAVE_SUPERGATHER == 0
+/* Replaces an existing IVP_GATHERANX8S definition with a scalar loop: one IVP_LS2NX8_X load per offset element, results collected into an xb_vecNx16 that is returned. */
+#ifdef IVP_GATHERANX8S
+#undef IVP_GATHERANX8S
+static inline xb_vecNx16 IVP_GATHERANX8S(const signed char * _base, xb_vecNx16U _offsets)
+{
+  const signed char *_basePtr = _base;
+  xb_vecNx16U _offsetsVec     = _offsets;
+  xb_vecNx16 _dataVec         = (xb_vecNx16) 0;
+  int _i;
+  for (_i = 0; _i < 32; _i++) /* NOTE(review): pass count is hard-coded to 32 -- confirm it matches the 16-bit lane count of every target. */
+  {
+    /* Each pass: read one offset, load the byte at _basePtr + offset, rotate the offsets, and select the loaded value into _dataVec. */
+    unsigned short offset = IVP_MOVAVU16(_offsetsVec);
+    xb_int8 gdata         = IVP_LS2NX8_X(_basePtr, offset);
+    _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1);
+    _dataVec    = IVP_SELNX16I(IVP_MOVNX16_FROM8(gdata), _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1);
+  }
+  return(_dataVec);
+}
+#endif
+
+#ifdef IVP_GATHERANX8U
+#undef IVP_GATHERANX8U
+// Software replacement for IVP_GATHERANX8U, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Performs 32 scalar loads of unsigned 8-bit elements (IVP_LS2NX8U_X), one per
+// offset extracted from _offsets, and assembles them into an xb_vecNx16U.
+// NOTE(review): presumably lane i of the result holds _base[_offsets[i]]
+// zero-extended to 16 bits; confirm against the ISA reference.
+static inline xb_vecNx16U IVP_GATHERANX8U(const unsigned char * _base, xb_vecNx16U _offsets)
+{
+  const unsigned char *src = _base;
+  xb_vecNx16U pendingOffs  = _offsets;
+  xb_vecNx16U gathered     = (xb_vecNx16U) 0;
+  int remaining            = 32;
+
+  while (remaining > 0)
+  {
+    unsigned short curOff = IVP_MOVAVU16(pendingOffs);
+    xb_int8U elem         = IVP_LS2NX8U_X(src, curOff);
+
+    // Bring the next offset forward, then shift the loaded element into the result.
+    pendingOffs = IVP_SELNX16UI(pendingOffs, pendingOffs, IVP_SELI_16B_ROTATE_RIGHT_1);
+    gathered    = IVP_SELNX16UI(IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROM8U(elem)), gathered, IVP_SELI_16B_ROTATE_RIGHT_1);
+
+    remaining--;
+  }
+  return gathered;
+}
+#endif
+
+// Fallback for IVP_GATHERD2NX8_L when the name is not already defined.
+// Reinterprets the gather result _gsr as 8-bit lanes and applies
+// IVP_SEL2NX8I with a zero vector and IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0.
+// NOTE(review): presumably this packs the low byte of each 16-bit gathered
+// element into the lower half of the result and zeros the upper half; confirm
+// against the IVP_SEL2NX8I documentation.
+#ifndef IVP_GATHERD2NX8_L
+#define IVP_GATHERD2NX8_L(_gsr)  IVP_SEL2NX8I((xb_vec2Nx8) 0, IVP_MOV2NX8_FROMNX16(_gsr), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0)
+#endif
+
+// Fallback for IVP_GATHERD2NX8_H when the name is not already defined.
+// Packs the gather result _gsr with IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0 (the same
+// step as the _L variant) and merges it into _vec using
+// IVP_SELI_EXTRACT_LO_HALVES; _vec is updated in place and must be an lvalue.
+// NOTE(review): presumably the low half of _vec is kept and the packed bytes
+// land in its high half; confirm against the IVP_SEL2NX8I documentation.
+// The temporary uses a reserved-looking name so that a caller variable called
+// `tmp` passed as _vec is not shadowed by the macro's own local.
+#ifndef IVP_GATHERD2NX8_H
+#define IVP_GATHERD2NX8_H(_vec, _gsr)  do {                                                                                      \
+    xb_vec2Nx8 _gatherdPacked_ = IVP_SEL2NX8I((xb_vec2Nx8) 0, IVP_MOV2NX8_FROMNX16(_gsr), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); \
+    _vec = IVP_SEL2NX8I(_gatherdPacked_, _vec, IVP_SELI_EXTRACT_LO_HALVES);                                                   \
+} while (0)
+#endif
+
+// Fallback for IVP_GATHERD2NX8U_H when the name is not already defined.
+// Unsigned counterpart of IVP_GATHERD2NX8_H: packs the gather result _gsr with
+// IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0 and merges it into _vec using
+// IVP_SELI_EXTRACT_LO_HALVES; _vec is updated in place and must be an lvalue.
+// NOTE(review): presumably the low half of _vec is kept and the packed bytes
+// land in its high half; confirm against the IVP_SEL2NX8UI documentation.
+// The temporary uses a reserved-looking name so that a caller variable called
+// `tmp` passed as _vec is not shadowed by the macro's own local.
+#ifndef IVP_GATHERD2NX8U_H
+#define IVP_GATHERD2NX8U_H(_vec, _gsr)  do {                                                                                         \
+    xb_vec2Nx8U _gatherdPacked_ = IVP_SEL2NX8UI((xb_vec2Nx8U) 0, IVP_MOV2NX8U_FROMNX16(_gsr), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); \
+    _vec = IVP_SEL2NX8UI(_gatherdPacked_, _vec, IVP_SELI_EXTRACT_LO_HALVES);                                                      \
+} while (0)
+#endif
+
+// Fallback for IVP_GATHERD2NX8U_L when the name is not already defined.
+// Unsigned counterpart of IVP_GATHERD2NX8_L: reinterprets the gather result
+// _gsr as unsigned 8-bit lanes and applies IVP_SEL2NX8UI with a zero vector
+// and IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0. The operands use the unsigned types,
+// matching IVP_GATHERD2NX8U_H, instead of relying on signed-to-unsigned
+// vector conversion.
+// NOTE(review): presumably this packs the low byte of each 16-bit gathered
+// element into the lower half of the result and zeros the upper half.
+#ifndef IVP_GATHERD2NX8U_L
+#define IVP_GATHERD2NX8U_L(_gsr)  IVP_SEL2NX8UI((xb_vec2Nx8U) 0, IVP_MOV2NX8U_FROMNX16(_gsr), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0)
+#endif
+
+// Software replacement for IVP_SCATTERNX8U, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Runs 32 iterations; each one extracts an offset with IVP_MOVAVU16, issues
+// IVP_SSNX8U_X(data, base, offset), then rotates both the data and the offset
+// vectors right by one 16-bit lane for the next pass.
+//   _dataIn  : xb_vecNx16U values to store
+//   _base    : unsigned char * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+// NOTE(review): presumably element i ends up at _base + _offsets[i]; confirm
+// the offset units and store width against the IVP_SSNX8U_X description.
+// The locals (_dataVec, _offsetsVec, _basePtr, _i, offset) are visible to the
+// argument expressions, so do not pass variables with those names.
+#ifdef IVP_SCATTERNX8U
+#undef IVP_SCATTERNX8U
+#define IVP_SCATTERNX8U(_dataIn, _base, _offsets)  do {                                   \
+    xb_vecNx16U _dataVec    = _dataIn;                                                    \
+    xb_vecNx16U _offsetsVec = _offsets;                                                   \
+    unsigned char *_basePtr = _base;                                                      \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      IVP_SSNX8U_X(_dataVec, _basePtr, offset);                                           \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SELNX16UI(_dataVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1);       \
+    }                                                                                     \
+} while (0)
+#endif
+
+// Predicated software replacement for IVP_SCATTERNX8UT, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// The predicate _vbr is expanded with IVP_MOVNX16T into a vector of 1/0
+// flags. Each of the 32 iterations extracts one flag (IVP_MOVAV16) and one
+// offset (IVP_MOVAVU16), issues IVP_SSNX8U_X only when the flag is non-zero,
+// then rotates data, offsets and flags right by one 16-bit lane.
+//   _dataIn  : xb_vecNx16U values to store
+//   _base    : unsigned char * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+//   _vbr     : per-lane predicate
+// The locals (_dataVec, _offsetsVec, _basePtr, _condsVec, _i, cond, offset)
+// are visible to the argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTERNX8UT
+#undef IVP_SCATTERNX8UT
+#define IVP_SCATTERNX8UT(_dataIn, _base, _offsets, _vbr)  do {                            \
+    xb_vecNx16U _dataVec    = _dataIn;                                                    \
+    xb_vecNx16U _offsetsVec = _offsets;                                                   \
+    unsigned char *_basePtr = _base;                                                      \
+    xb_vecNx16 _condsVec    = IVP_MOVNX16T((xb_vecNx16) 1, (xb_vecNx16) 0, _vbr);         \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      short cond            = IVP_MOVAV16(_condsVec);                                     \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      if (cond) {                                                                         \
+        IVP_SSNX8U_X(_dataVec, _basePtr, offset); }                                       \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SELNX16UI(_dataVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1);       \
+      _condsVec   = IVP_SELNX16I(_condsVec, _condsVec, IVP_SELI_16B_ROTATE_RIGHT_1);      \
+    }                                                                                     \
+} while (0)
+#endif
+
+
+// Software replacement for IVP_SCATTER2NX8_L, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Runs 32 iterations over a signed 8-bit data vector: each one extracts an
+// offset with IVP_MOVAVU16, issues IVP_SS2NX8_X(data, base, offset), rotates
+// the offsets right by one 16-bit lane and the data right by one 8-bit lane.
+//   _dataIn  : xb_vec2Nx8 values to store
+//   _base    : signed char * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+// NOTE(review): with 32 iterations and a one-byte rotate, only the first 32
+// data bytes appear to be stored (the "_L" half) - confirm.
+// The locals (_dataVec, _offsetsVec, _basePtr, _i, offset) are visible to the
+// argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTER2NX8_L
+#undef IVP_SCATTER2NX8_L
+#define IVP_SCATTER2NX8_L(_dataIn, _base, _offsets)  do {                                 \
+    xb_vec2Nx8 _dataVec     = _dataIn;                                                    \
+    xb_vecNx16U _offsetsVec = _offsets;                                                   \
+    signed char *_basePtr   = _base;                                                      \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      IVP_SS2NX8_X(_dataVec, _basePtr, offset);                                           \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SEL2NX8I(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1);         \
+    }                                                                                     \
+} while (0)
+#endif
+
+
+// Predicated software replacement for IVP_SCATTER2NX8T_L, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// The predicate _vbr is expanded with IVP_MOV2NX8T into 8-bit 1/0 flags. Each
+// of the 32 iterations extracts one flag (IVP_MOVAV8) and one offset
+// (IVP_MOVAVU16), issues IVP_SS2NX8_X only when the flag is non-zero, then
+// rotates the offsets by one 16-bit lane and data/flags by one 8-bit lane.
+//   _dataIn  : xb_vec2Nx8 values to store
+//   _base    : signed char * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+//   _vbr     : per-lane predicate
+// The locals (_dataVec, _offsetsVec, _basePtr, _condsVec, _i, cond, offset)
+// are visible to the argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTER2NX8T_L
+#undef IVP_SCATTER2NX8T_L
+#define IVP_SCATTER2NX8T_L(_dataIn, _base, _offsets, _vbr)  do {                          \
+    xb_vec2Nx8 _dataVec     = _dataIn;                                                    \
+    xb_vecNx16U _offsetsVec = _offsets;                                                   \
+    signed char *_basePtr   = _base;                                                      \
+    xb_vec2Nx8 _condsVec    = IVP_MOV2NX8T((xb_vec2Nx8) 1, (xb_vec2Nx8) 0, _vbr);         \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      signed char cond      = IVP_MOVAV8(_condsVec);                                      \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      if (cond) {                                                                         \
+        IVP_SS2NX8_X(_dataVec, _basePtr, offset); }                                       \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SEL2NX8I(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1);         \
+      _condsVec   = IVP_SEL2NX8I(_condsVec, _condsVec, IVP_SELI_8B_ROTATE_RIGHT_1);       \
+    }                                                                                     \
+} while (0)
+#endif
+
+// Software replacement for IVP_SCATTER2NX8U_L, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Unsigned counterpart of IVP_SCATTER2NX8_L: 32 iterations, each extracting an
+// offset with IVP_MOVAVU16, issuing IVP_SS2NX8U_X(data, base, offset), and
+// rotating offsets by one 16-bit lane and data by one 8-bit lane.
+//   _dataIn  : xb_vec2Nx8U values to store
+//   _base    : unsigned char * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+// The locals (_dataVec, _offsetsVec, _basePtr, _i, offset) are visible to the
+// argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTER2NX8U_L
+#undef IVP_SCATTER2NX8U_L
+#define IVP_SCATTER2NX8U_L(_dataIn, _base, _offsets)  do {                                \
+    xb_vec2Nx8U _dataVec    = _dataIn;                                                    \
+    xb_vecNx16U _offsetsVec = _offsets;                                                   \
+    unsigned char *_basePtr = _base;                                                      \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      IVP_SS2NX8U_X(_dataVec, _basePtr, offset);                                          \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SEL2NX8UI(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1);        \
+    }                                                                                     \
+} while (0)
+#endif
+
+// Predicated software replacement for IVP_SCATTER2NX8UT_L, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Unsigned counterpart of IVP_SCATTER2NX8T_L: _vbr is expanded with
+// IVP_MOV2NX8T into 8-bit 1/0 flags, and each of the 32 iterations issues
+// IVP_SS2NX8U_X only when the extracted flag (IVP_MOVAV8) is non-zero before
+// rotating offsets, data and flags to the next lane.
+//   _dataIn  : xb_vec2Nx8U values to store
+//   _base    : unsigned char * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+//   _vbr     : per-lane predicate
+// The locals (_dataVec, _offsetsVec, _basePtr, _condsVec, _i, cond, offset)
+// are visible to the argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTER2NX8UT_L
+#undef IVP_SCATTER2NX8UT_L
+#define IVP_SCATTER2NX8UT_L(_dataIn, _base, _offsets, _vbr)  do {                         \
+    xb_vec2Nx8U _dataVec    = _dataIn;                                                    \
+    xb_vecNx16U _offsetsVec = _offsets;                                                   \
+    unsigned char *_basePtr = _base;                                                      \
+    xb_vec2Nx8 _condsVec    = IVP_MOV2NX8T((xb_vec2Nx8) 1, (xb_vec2Nx8) 0, _vbr);         \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      signed char cond      = IVP_MOVAV8(_condsVec);                                      \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      if (cond) {                                                                         \
+        IVP_SS2NX8U_X(_dataVec, _basePtr, offset); }                                      \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SEL2NX8UI(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1);        \
+      _condsVec   = IVP_SEL2NX8I(_condsVec, _condsVec, IVP_SELI_8B_ROTATE_RIGHT_1);       \
+    }                                                                                     \
+} while (0)
+#endif
+
+// Software replacement for IVP_SCATTER2NX8_H, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// First rearranges the signed 8-bit data with IVP_SELI_8B_EXTRACT_HI_HALVES,
+// then runs the same 32-iteration loop as the _L variant: extract an offset
+// (IVP_MOVAVU16), issue IVP_SS2NX8_X, rotate offsets by one 16-bit lane and
+// data by one 8-bit lane.
+//   _dataIn  : xb_vec2Nx8 values to store
+//   _base    : signed char * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+// NOTE(review): presumably the EXTRACT_HI_HALVES select moves the upper 32
+// data bytes to the front so that they are the ones stored - confirm.
+// The locals (_dataVec, _offsetsVec, _basePtr, _i, offset) are visible to the
+// argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTER2NX8_H
+#undef IVP_SCATTER2NX8_H
+#define IVP_SCATTER2NX8_H(_dataIn, _base, _offsets)  do {                                 \
+    xb_vec2Nx8 _dataVec     = _dataIn;                                                    \
+    xb_vecNx16U _offsetsVec = _offsets;                                                   \
+    signed char *_basePtr   = _base;                                                      \
+    _dataVec = IVP_SEL2NX8I(_dataVec, _dataVec, IVP_SELI_8B_EXTRACT_HI_HALVES);           \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      IVP_SS2NX8_X(_dataVec, _basePtr, offset);                                           \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SEL2NX8I(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1);         \
+    }                                                                                     \
+} while (0)
+#endif
+
+// Software replacement for IVP_SCATTER2NX8U_H, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Unsigned counterpart of IVP_SCATTER2NX8_H: rearranges the data with
+// IVP_SELI_8B_EXTRACT_HI_HALVES, then runs 32 iterations that extract an
+// offset (IVP_MOVAVU16), issue IVP_SS2NX8U_X, and rotate offsets by one
+// 16-bit lane and data by one 8-bit lane.
+//   _dataIn  : xb_vec2Nx8U values to store
+//   _base    : unsigned char * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+// NOTE(review): presumably the EXTRACT_HI_HALVES select moves the upper 32
+// data bytes to the front so that they are the ones stored - confirm.
+// The locals (_dataVec, _offsetsVec, _basePtr, _i, offset) are visible to the
+// argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTER2NX8U_H
+#undef IVP_SCATTER2NX8U_H
+#define IVP_SCATTER2NX8U_H(_dataIn, _base, _offsets)  do {                                \
+    xb_vec2Nx8U _dataVec    = _dataIn;                                                    \
+    xb_vecNx16U _offsetsVec = _offsets;                                                   \
+    unsigned char *_basePtr = _base;                                                      \
+    _dataVec = IVP_SEL2NX8UI(_dataVec, _dataVec, IVP_SELI_8B_EXTRACT_HI_HALVES);          \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      IVP_SS2NX8U_X(_dataVec, _basePtr, offset);                                          \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SEL2NX8UI(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1);        \
+    }                                                                                     \
+} while (0)
+#endif
+
+// Predicated software replacement for IVP_SCATTER2NX8T_H, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// _vbr is expanded with IVP_MOV2NX8T into 8-bit 1/0 flags; both the data and
+// the flags are rearranged with IVP_SELI_8B_EXTRACT_HI_HALVES. Each of the 32
+// iterations then extracts a flag (IVP_MOVAV8) and an offset (IVP_MOVAVU16),
+// issues IVP_SS2NX8_X only when the flag is non-zero, and rotates offsets,
+// data and flags to the next lane.
+//   _dataIn  : xb_vec2Nx8 values to store
+//   _base    : signed char * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+//   _vbr     : per-lane predicate
+// The locals (_dataVec, _offsetsVec, _basePtr, _condsVec, _i, cond, offset)
+// are visible to the argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTER2NX8T_H
+#undef IVP_SCATTER2NX8T_H
+#define IVP_SCATTER2NX8T_H(_dataIn, _base, _offsets, _vbr)  do {                          \
+    xb_vec2Nx8 _dataVec     = _dataIn;                                                    \
+    xb_vecNx16U _offsetsVec = _offsets;                                                   \
+    signed char *_basePtr   = _base;                                                      \
+    xb_vec2Nx8 _condsVec    = IVP_MOV2NX8T((xb_vec2Nx8) 1, (xb_vec2Nx8) 0, _vbr);         \
+    _dataVec  = IVP_SEL2NX8I(_dataVec, _dataVec, IVP_SELI_8B_EXTRACT_HI_HALVES);          \
+    _condsVec = IVP_SEL2NX8I(_condsVec, _condsVec, IVP_SELI_8B_EXTRACT_HI_HALVES);        \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      signed char cond      = IVP_MOVAV8(_condsVec);                                      \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      if (cond) {                                                                         \
+        IVP_SS2NX8_X(_dataVec, _basePtr, offset); }                                       \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SEL2NX8I(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1);         \
+      _condsVec   = IVP_SEL2NX8I(_condsVec, _condsVec, IVP_SELI_8B_ROTATE_RIGHT_1);       \
+    }                                                                                     \
+} while (0)
+#endif
+
+// Predicated software replacement for IVP_SCATTER2NX8UT_H, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Unsigned counterpart of IVP_SCATTER2NX8T_H: _vbr becomes 8-bit 1/0 flags
+// (IVP_MOV2NX8T), data and flags are rearranged with
+// IVP_SELI_8B_EXTRACT_HI_HALVES, and each of the 32 iterations issues
+// IVP_SS2NX8U_X only when the extracted flag (IVP_MOVAV8) is non-zero before
+// rotating offsets, data and flags to the next lane.
+//   _dataIn  : xb_vec2Nx8U values to store
+//   _base    : unsigned char * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+//   _vbr     : per-lane predicate
+// The locals (_dataVec, _offsetsVec, _basePtr, _condsVec, _i, cond, offset)
+// are visible to the argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTER2NX8UT_H
+#undef IVP_SCATTER2NX8UT_H
+#define IVP_SCATTER2NX8UT_H(_dataIn, _base, _offsets, _vbr)  do {                         \
+    xb_vec2Nx8U _dataVec    = _dataIn;                                                    \
+    xb_vecNx16U _offsetsVec = _offsets;                                                   \
+    unsigned char *_basePtr = _base;                                                      \
+    xb_vec2Nx8 _condsVec    = IVP_MOV2NX8T((xb_vec2Nx8) 1, (xb_vec2Nx8) 0, _vbr);         \
+    _dataVec  = IVP_SEL2NX8UI(_dataVec, _dataVec, IVP_SELI_8B_EXTRACT_HI_HALVES);         \
+    _condsVec = IVP_SEL2NX8I(_condsVec, _condsVec, IVP_SELI_8B_EXTRACT_HI_HALVES);        \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      signed char cond      = IVP_MOVAV8(_condsVec);                                      \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      if (cond) {                                                                         \
+        IVP_SS2NX8U_X(_dataVec, _basePtr, offset); }                                      \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SEL2NX8UI(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1);        \
+      _condsVec   = IVP_SEL2NX8I(_condsVec, _condsVec, IVP_SELI_8B_ROTATE_RIGHT_1);       \
+    }                                                                                     \
+} while (0)
+#endif
+
+// Software replacement for IVP_SCATTERNX16, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Runs 32 iterations; each one extracts an offset with IVP_MOVAVU16, issues
+// IVP_SSNX16_X(data, base, offset), then rotates both the data and the offset
+// vectors right by one 16-bit lane.
+//   _dataIn  : xb_vecNx16 values to store
+//   _base    : short * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+// NOTE(review): presumably element i ends up at _base + _offsets[i]; confirm
+// the offset units against the IVP_SSNX16_X description.
+// The locals (_dataVec, _offsetsVec, _basePtr, _i, offset) are visible to the
+// argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTERNX16
+#undef IVP_SCATTERNX16
+#define IVP_SCATTERNX16(_dataIn, _base, _offsets)  do {                                   \
+    xb_vecNx16 _dataVec     = _dataIn;                                                    \
+    xb_vecNx16U _offsetsVec = _offsets;                                                   \
+    short *_basePtr         = _base;                                                      \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      IVP_SSNX16_X(_dataVec, _basePtr, offset);                                           \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SELNX16I(_dataVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1);        \
+    }                                                                                     \
+} while (0)
+#endif
+
+// Software replacement for IVP_SCATTERNX16U, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Unsigned counterpart of IVP_SCATTERNX16: 32 iterations, each extracting an
+// offset with IVP_MOVAVU16, issuing IVP_SSNX16U_X(data, base, offset), and
+// rotating data and offsets right by one 16-bit lane.
+//   _dataIn  : xb_vecNx16U values to store
+//   _base    : unsigned short * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+// The locals (_dataVec, _offsetsVec, _basePtr, _i, offset) are visible to the
+// argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTERNX16U
+#undef IVP_SCATTERNX16U
+#define IVP_SCATTERNX16U(_dataIn, _base, _offsets)  do {                                  \
+    xb_vecNx16U _dataVec     = _dataIn;                                                   \
+    xb_vecNx16U _offsetsVec  = _offsets;                                                  \
+    unsigned short *_basePtr = _base;                                                     \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      IVP_SSNX16U_X(_dataVec, _basePtr, offset);                                          \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SELNX16UI(_dataVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1);       \
+    }                                                                                     \
+} while (0)
+#endif
+
+// Predicated software replacement for IVP_SCATTERNX16T, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// _vbr is expanded with IVP_MOVNX16T into 16-bit 1/0 flags. Each of the 32
+// iterations extracts an offset (IVP_MOVAVU16) and a flag (IVP_MOVAV16),
+// issues IVP_SSNX16_X only when the flag is non-zero, then rotates offsets,
+// data and flags right by one 16-bit lane.
+//   _dataIn  : xb_vecNx16 values to store
+//   _base    : destination base; cast to short *
+//   _offsets : xb_vecNx16U per-lane offsets
+//   _vbr     : per-lane predicate
+// The _base argument is parenthesised before the cast so that an expression
+// such as `ptr + n` is cast as a whole rather than as `((short *) ptr) + n`.
+// The locals (_dataVec, _offsetsVec, _basePtr, _condsVec, _i, cond, offset)
+// are visible to the argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTERNX16T
+#undef IVP_SCATTERNX16T
+#define IVP_SCATTERNX16T(_dataIn, _base, _offsets, _vbr)  do {                            \
+    xb_vecNx16 _dataVec     = _dataIn;                                                    \
+    xb_vecNx16U _offsetsVec = _offsets;                                                   \
+    short *_basePtr         = (short *) (_base);                                          \
+    xb_vecNx16 _condsVec    = IVP_MOVNX16T((xb_vecNx16) 1, (xb_vecNx16) 0, _vbr);         \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      short cond            = IVP_MOVAV16(_condsVec);                                     \
+      if (cond) {                                                                         \
+        IVP_SSNX16_X(_dataVec, _basePtr, offset); }                                       \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SELNX16I(_dataVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1);        \
+      _condsVec   = IVP_SELNX16I(_condsVec, _condsVec, IVP_SELI_16B_ROTATE_RIGHT_1);      \
+    }                                                                                     \
+} while (0)
+#endif
+
+// Predicated software replacement for IVP_SCATTERNX16UT, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Unsigned counterpart of IVP_SCATTERNX16T: _vbr becomes 16-bit 1/0 flags
+// (IVP_MOVNX16T), and each of the 32 iterations issues IVP_SSNX16U_X only when
+// the extracted flag (IVP_MOVAV16) is non-zero before rotating offsets, data
+// and flags right by one 16-bit lane.
+//   _dataIn  : xb_vecNx16U values to store
+//   _base    : unsigned short * destination base
+//   _offsets : xb_vecNx16U per-lane offsets
+//   _vbr     : per-lane predicate
+// The locals (_dataVec, _offsetsVec, _basePtr, _condsVec, _i, cond, offset)
+// are visible to the argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTERNX16UT
+#undef IVP_SCATTERNX16UT
+#define IVP_SCATTERNX16UT(_dataIn, _base, _offsets, _vbr)  do {                           \
+    xb_vecNx16U _dataVec     = _dataIn;                                                   \
+    xb_vecNx16U _offsetsVec  = _offsets;                                                  \
+    unsigned short *_basePtr = _base;                                                     \
+    xb_vecNx16 _condsVec     = IVP_MOVNX16T((xb_vecNx16) 1, (xb_vecNx16) 0, _vbr);        \
+    int _i;                                                                               \
+    for (_i = 0; _i < 32; _i++) {                                                         \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);                                  \
+      short cond            = IVP_MOVAV16(_condsVec);                                     \
+      if (cond) {                                                                         \
+        IVP_SSNX16U_X(_dataVec, _basePtr, offset); }                                      \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SELNX16UI(_dataVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1);       \
+      _condsVec   = IVP_SELNX16I(_condsVec, _condsVec, IVP_SELI_16B_ROTATE_RIGHT_1);      \
+    }                                                                                     \
+} while (0)
+#endif
+
+// Software replacement for IVP_SCATTERN_2X32, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Runs 16 iterations; each one extracts an offset with IVP_MOVAV32, issues
+// IVP_SSN_2X32_X(data, base, offset), then rotates both the data and the
+// offset vectors right by one 32-bit lane.
+//   _dataIn  : xb_vecN_2x32v values to store
+//   _base    : int * destination base
+//   _offsets : xb_vecN_2x32Uv per-lane offsets
+// NOTE(review): presumably element i ends up at _base + _offsets[i]; confirm
+// the offset units against the IVP_SSN_2X32_X description.
+// The locals (_dataVec, _offsetsVec, _basePtr, _i, offset) are visible to the
+// argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTERN_2X32
+#undef IVP_SCATTERN_2X32
+#define IVP_SCATTERN_2X32(_dataIn, _base, _offsets)  do {                                   \
+    xb_vecN_2x32v _dataVec     = _dataIn;                                                   \
+    xb_vecN_2x32Uv _offsetsVec = _offsets;                                                  \
+    int *_basePtr              = _base;                                                     \
+    int _i;                                                                                 \
+    for (_i = 0; _i < 16; _i++) {                                                           \
+      unsigned int offset = IVP_MOVAV32(_offsetsVec);                                       \
+      IVP_SSN_2X32_X(_dataVec, _basePtr, offset);                                           \
+      _offsetsVec = IVP_SELN_2X32UI(_offsetsVec, _offsetsVec, IVP_SELI_32B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SELN_2X32I(_dataVec, _dataVec, IVP_SELI_32B_ROTATE_RIGHT_1);        \
+    }                                                                                       \
+} while (0)
+#endif
+
+// Predicated software replacement for IVP_SCATTERN_2X32T, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// _vbr is expanded with IVP_MOVN_2X32T into 32-bit 1/0 flags. Each of the 16
+// iterations extracts a flag and an offset (both via IVP_MOVAV32), issues
+// IVP_SSN_2X32_X only when the flag is non-zero, then rotates offsets, data
+// and flags right by one 32-bit lane.
+//   _dataIn  : xb_vecN_2x32v values to store
+//   _base    : int * destination base
+//   _offsets : xb_vecN_2x32Uv per-lane offsets
+//   _vbr     : per-lane predicate
+// The locals (_dataVec, _offsetsVec, _basePtr, _condsVec, _i, cond, offset)
+// are visible to the argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTERN_2X32T
+#undef IVP_SCATTERN_2X32T
+#define IVP_SCATTERN_2X32T(_dataIn, _base, _offsets, _vbr)  do {                             \
+    xb_vecN_2x32v _dataVec     = _dataIn;                                                    \
+    xb_vecN_2x32Uv _offsetsVec = _offsets;                                                   \
+    int *_basePtr              = _base;                                                      \
+    xb_vecN_2x32v _condsVec    = IVP_MOVN_2X32T((xb_vecN_2x32v) 1, (xb_vecN_2x32v) 0, _vbr); \
+    int _i;                                                                                  \
+    for (_i = 0; _i < 16; _i++) {                                                            \
+      int cond            = IVP_MOVAV32(_condsVec);                                          \
+      unsigned int offset = IVP_MOVAV32(_offsetsVec);                                        \
+      if (cond) {                                                                            \
+        IVP_SSN_2X32_X(_dataVec, _basePtr, offset); }                                        \
+      _offsetsVec = IVP_SELN_2X32UI(_offsetsVec, _offsetsVec, IVP_SELI_32B_ROTATE_RIGHT_1);  \
+      _dataVec    = IVP_SELN_2X32I(_dataVec, _dataVec, IVP_SELI_32B_ROTATE_RIGHT_1);         \
+      _condsVec   = IVP_SELN_2X32I(_condsVec, _condsVec, IVP_SELI_32B_ROTATE_RIGHT_1);       \
+    }                                                                                        \
+} while (0)
+#endif
+
+// Software replacement for IVP_SCATTERN_2X32U, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Unsigned counterpart of IVP_SCATTERN_2X32: 16 iterations, each extracting an
+// offset with IVP_MOVAV32, issuing IVP_SSN_2X32U_X(data, base, offset), and
+// rotating data and offsets right by one 32-bit lane.
+//   _dataIn  : xb_vecN_2x32Uv values to store
+//   _base    : unsigned int * destination base
+//   _offsets : xb_vecN_2x32Uv per-lane offsets
+// The locals (_dataVec, _offsetsVec, _basePtr, _i, offset) are visible to the
+// argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTERN_2X32U
+#undef IVP_SCATTERN_2X32U
+#define IVP_SCATTERN_2X32U(_dataIn, _base, _offsets)  do {                                  \
+    xb_vecN_2x32Uv _dataVec    = _dataIn;                                                   \
+    xb_vecN_2x32Uv _offsetsVec = _offsets;                                                  \
+    unsigned int *_basePtr     = _base;                                                     \
+    int _i;                                                                                 \
+    for (_i = 0; _i < 16; _i++) {                                                           \
+      unsigned int offset = IVP_MOVAV32(_offsetsVec);                                       \
+      IVP_SSN_2X32U_X(_dataVec, _basePtr, offset);                                          \
+      _offsetsVec = IVP_SELN_2X32UI(_offsetsVec, _offsetsVec, IVP_SELI_32B_ROTATE_RIGHT_1); \
+      _dataVec    = IVP_SELN_2X32UI(_dataVec, _dataVec, IVP_SELI_32B_ROTATE_RIGHT_1);       \
+    }                                                                                       \
+} while (0)
+#endif
+
+// Predicated software replacement for IVP_SCATTERN_2X32UT, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Unsigned counterpart of IVP_SCATTERN_2X32T: _vbr becomes 32-bit 1/0 flags
+// (IVP_MOVN_2X32T), and each of the 16 iterations issues IVP_SSN_2X32U_X only
+// when the extracted flag (IVP_MOVAV32) is non-zero before rotating offsets,
+// data and flags right by one 32-bit lane.
+//   _dataIn  : xb_vecN_2x32Uv values to store
+//   _base    : unsigned int * destination base
+//   _offsets : xb_vecN_2x32Uv per-lane offsets
+//   _vbr     : per-lane predicate
+// The locals (_dataVec, _offsetsVec, _basePtr, _condsVec, _i, cond, offset)
+// are visible to the argument expressions; do not pass variables so named.
+#ifdef IVP_SCATTERN_2X32UT
+#undef IVP_SCATTERN_2X32UT
+#define IVP_SCATTERN_2X32UT(_dataIn, _base, _offsets, _vbr)  do {                            \
+    xb_vecN_2x32Uv _dataVec    = _dataIn;                                                    \
+    xb_vecN_2x32Uv _offsetsVec = _offsets;                                                   \
+    unsigned int *_basePtr     = _base;                                                      \
+    xb_vecN_2x32v _condsVec    = IVP_MOVN_2X32T((xb_vecN_2x32v) 1, (xb_vecN_2x32v) 0, _vbr); \
+    int _i;                                                                                  \
+    for (_i = 0; _i < 16; _i++) {                                                            \
+      int cond            = IVP_MOVAV32(_condsVec);                                          \
+      unsigned int offset = IVP_MOVAV32(_offsetsVec);                                        \
+      if (cond) {                                                                            \
+        IVP_SSN_2X32U_X(_dataVec, _basePtr, offset); }                                       \
+      _offsetsVec = IVP_SELN_2X32UI(_offsetsVec, _offsetsVec, IVP_SELI_32B_ROTATE_RIGHT_1);  \
+      _dataVec    = IVP_SELN_2X32UI(_dataVec, _dataVec, IVP_SELI_32B_ROTATE_RIGHT_1);        \
+      _condsVec   = IVP_SELN_2X32I(_condsVec, _condsVec, IVP_SELI_32B_ROTATE_RIGHT_1);       \
+    }                                                                                        \
+} while (0)
+#endif
+
+#ifdef IVP_GATHERANX16U
+#undef IVP_GATHERANX16U
+// Software replacement for IVP_GATHERANX16U, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Performs 32 scalar loads of unsigned 16-bit elements (IVP_LSNX16U_X), one
+// per offset extracted from _offsets, and returns them reinterpreted as an
+// xb_vecNx16.
+// NOTE(review): presumably lane i of the result holds the element addressed
+// by _offsets[i]; confirm the offset units against the ISA reference.
+static inline xb_vecNx16 IVP_GATHERANX16U(const uint16_t *_base, xb_vecNx16U _offsets)
+{
+  const unsigned short *src = _base;
+  xb_vecNx16U pendingOffs   = _offsets;
+  xb_vecNx16U gathered      = (xb_vecNx16U) 0;
+  int remaining             = 32;
+
+  while (remaining > 0)
+  {
+    unsigned short curOff = IVP_MOVAVU16(pendingOffs);
+    xb_int16U elem        = IVP_LSNX16U_X(src, curOff);
+
+    // Bring the next offset forward, then shift the loaded element into the result.
+    pendingOffs = IVP_SELNX16UI(pendingOffs, pendingOffs, IVP_SELI_16B_ROTATE_RIGHT_1);
+    gathered    = IVP_SELNX16UI(IVP_MOVNX16U_FROM16U(elem), gathered, IVP_SELI_16B_ROTATE_RIGHT_1);
+
+    remaining--;
+  }
+  return IVP_MOVNX16_FROMNX16U(gathered);
+}
+#endif
+
+#ifdef IVP_GATHERANX16
+#undef IVP_GATHERANX16
+// Software replacement for IVP_GATHERANX16, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Performs 32 scalar loads of signed 16-bit elements (IVP_LSNX16_X), one per
+// offset extracted from _offsets, and assembles them into an xb_vecNx16.
+// NOTE(review): presumably lane i of the result holds the element addressed
+// by _offsets[i]; confirm the offset units against the ISA reference.
+static inline xb_vecNx16 IVP_GATHERANX16(const int16_t *_base, xb_vecNx16U _offsets)
+{
+  const short *src        = _base;
+  xb_vecNx16U pendingOffs = _offsets;
+  xb_vecNx16 gathered     = (xb_vecNx16) 0;
+  int remaining           = 32;
+
+  while (remaining > 0)
+  {
+    unsigned short curOff = IVP_MOVAVU16(pendingOffs);
+    xb_int16 elem         = IVP_LSNX16_X(src, curOff);
+
+    // Bring the next offset forward, then shift the loaded element into the result.
+    pendingOffs = IVP_SELNX16UI(pendingOffs, pendingOffs, IVP_SELI_16B_ROTATE_RIGHT_1);
+    gathered    = IVP_SELNX16I(IVP_MOVNX16_FROM16(elem), gathered, IVP_SELI_16B_ROTATE_RIGHT_1);
+
+    remaining--;
+  }
+  return gathered;
+}
+#endif
+
+#ifdef IVP_GATHERANX16T
+#undef IVP_GATHERANX16T
+// Predicated software replacement for IVP_GATHERANX16T, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// For each of the 32 lanes, a signed 16-bit element is loaded with
+// IVP_LSNX16_X only when the corresponding predicate flag is set; disabled
+// lanes contribute zero and never touch memory. This matters because the
+// offsets in predicated-off lanes may not address valid memory.
+//   _base    : base address of the signed 16-bit source data
+//   _offsets : per-lane offsets (xb_vecNx16U)
+//   _vbr     : per-lane predicate
+// Returns the gathered elements with predicated-off lanes forced to zero.
+static inline xb_vecNx16 IVP_GATHERANX16T(const int16_t *_base, xb_vecNx16U _offsets, vboolN _vbr)
+{
+  const short *_basePtr   = _base;
+  xb_vecNx16U _offsetsVec = _offsets;
+  vboolN _boolVec         = _vbr;
+  // 1 in enabled lanes, 0 elsewhere, so the predicate can be tested per lane.
+  xb_vecNx16 _condsVec    = IVP_MOVNX16T((xb_vecNx16) 1, (xb_vecNx16) 0, _boolVec);
+  xb_vecNx16 _dataVec     = (xb_vecNx16) 0;
+  int _i;
+  for (_i = 0; _i < 32; _i++)
+  {
+    unsigned short offset = IVP_MOVAVU16(_offsetsVec);
+    short cond            = IVP_MOVAV16(_condsVec);
+    xb_vecNx16 _elemVec   = (xb_vecNx16) 0;
+    if (cond)
+    {
+      // Only enabled lanes are allowed to read memory.
+      _elemVec = IVP_MOVNX16_FROM16(IVP_LSNX16_X(_basePtr, offset));
+    }
+    _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1);
+    _condsVec   = IVP_SELNX16I(_condsVec, _condsVec, IVP_SELI_16B_ROTATE_RIGHT_1);
+    _dataVec    = IVP_SELNX16I(_elemVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1);
+  }
+  // Kept from the original: guarantees zero in predicated-off lanes.
+  return(IVP_MOVNX16T(_dataVec, (xb_vecNx16) 0, _boolVec));
+}
+#endif
+
+#ifdef IVP_GATHERANX16UT
+#undef IVP_GATHERANX16UT
+// Predicated software replacement for IVP_GATHERANX16UT, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// For each of the 32 lanes, an unsigned 16-bit element is loaded with
+// IVP_LSNX16U_X only when the corresponding predicate flag is set; disabled
+// lanes contribute zero and never touch memory. This matters because the
+// offsets in predicated-off lanes may not address valid memory.
+//   _base    : base address of the unsigned 16-bit source data
+//   _offsets : per-lane offsets (xb_vecNx16U)
+//   _vbr     : per-lane predicate
+// Returns the gathered elements (reinterpreted as xb_vecNx16) with
+// predicated-off lanes forced to zero.
+static inline xb_vecNx16 IVP_GATHERANX16UT(const uint16_t *_base, xb_vecNx16U _offsets, vboolN _vbr)
+{
+  const unsigned short *_basePtr = _base;
+  xb_vecNx16U _offsetsVec        = _offsets;
+  vboolN _boolVec                = _vbr;
+  // 1 in enabled lanes, 0 elsewhere, so the predicate can be tested per lane.
+  xb_vecNx16 _condsVec           = IVP_MOVNX16T((xb_vecNx16) 1, (xb_vecNx16) 0, _boolVec);
+  xb_vecNx16U _dataVec           = (xb_vecNx16U) 0;
+  int _i;
+  for (_i = 0; _i < 32; _i++)
+  {
+    unsigned short offset = IVP_MOVAVU16(_offsetsVec);
+    short cond            = IVP_MOVAV16(_condsVec);
+    xb_vecNx16U _elemVec  = (xb_vecNx16U) 0;
+    if (cond)
+    {
+      // Only enabled lanes are allowed to read memory.
+      _elemVec = IVP_MOVNX16U_FROM16U(IVP_LSNX16U_X(_basePtr, offset));
+    }
+    _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1);
+    _condsVec   = IVP_SELNX16I(_condsVec, _condsVec, IVP_SELI_16B_ROTATE_RIGHT_1);
+    _dataVec    = IVP_SELNX16UI(_elemVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1);
+  }
+  // Kept from the original: guarantees zero in predicated-off lanes.
+  return(IVP_MOVNX16_FROMNX16U(IVP_MOVNX16UT(_dataVec, (xb_vecNx16U) 0, _boolVec)));
+}
+#endif
+
+#ifdef IVP_GATHERAN_2X32
+#undef IVP_GATHERAN_2X32
+// Software replacement for IVP_GATHERAN_2X32, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// Performs 16 scalar loads of 32-bit elements (IVP_LSN_2X32_X), one per offset
+// extracted from _offsets, and returns them reinterpreted as an xb_vecNx16.
+// NOTE(review): presumably lane i of the result holds the element addressed
+// by _offsets[i]; confirm the offset units against the ISA reference.
+static inline xb_vecNx16 IVP_GATHERAN_2X32(const int32_t *_base, xb_vecN_2x32Uv _offsets)
+{
+  const int *src             = _base;
+  xb_vecN_2x32Uv pendingOffs = _offsets;
+  xb_vecN_2x32v gathered     = (xb_vecN_2x32v) 0;
+  int remaining              = 16;
+
+  while (remaining > 0)
+  {
+    unsigned int curOff = IVP_MOVAV32(pendingOffs);
+    xb_int32v elem      = IVP_LSN_2X32_X(src, curOff);
+
+    // Bring the next offset forward, then shift the loaded element into the result.
+    pendingOffs = IVP_SELN_2X32UI(pendingOffs, pendingOffs, IVP_SELI_32B_ROTATE_RIGHT_1);
+    gathered    = IVP_SELN_2X32I(IVP_MOVN_2X32_FROM32(elem), gathered, IVP_SELI_32B_ROTATE_RIGHT_1);
+
+    remaining--;
+  }
+  return IVP_MOVNX16_FROMN_2X32(gathered);
+}
+#endif
+
+#ifdef IVP_GATHERAN_2X32T
+#undef IVP_GATHERAN_2X32T
+// Predicated software replacement for IVP_GATHERAN_2X32T, built when
+// XCHAL_HAVE_SUPERGATHER == 0 and the name is already defined as a macro.
+// For each of the 16 lanes, a 32-bit element is loaded with IVP_LSN_2X32_X
+// only when the corresponding predicate flag is set; disabled lanes contribute
+// zero and never touch memory. This matters because the offsets in
+// predicated-off lanes may not address valid memory.
+//   _base    : base address of the 32-bit source data
+//   _offsets : per-lane offsets (xb_vecN_2x32Uv)
+//   _vbr     : per-lane predicate
+// Returns the gathered elements (reinterpreted as xb_vecNx16) with
+// predicated-off lanes forced to zero.
+static inline xb_vecNx16  IVP_GATHERAN_2X32T(const int32_t *_base, xb_vecN_2x32Uv _offsets, vboolN_2 _vbr)
+{
+  const int *_basePtr        = _base;
+  xb_vecN_2x32Uv _offsetsVec = _offsets;
+  vboolN_2 _boolVec          = _vbr;
+  // 1 in enabled lanes, 0 elsewhere, so the predicate can be tested per lane.
+  xb_vecN_2x32v _condsVec    = IVP_MOVN_2X32T((xb_vecN_2x32v) 1, (xb_vecN_2x32v) 0, _boolVec);
+  xb_vecN_2x32v _dataVec     = (xb_vecN_2x32v) 0;
+  int _i;
+  for (_i = 0; _i < 16; _i++)
+  {
+    unsigned int offset    = IVP_MOVAV32(_offsetsVec);
+    int cond               = IVP_MOVAV32(_condsVec);
+    xb_vecN_2x32v _elemVec = (xb_vecN_2x32v) 0;
+    if (cond)
+    {
+      // Only enabled lanes are allowed to read memory.
+      _elemVec = IVP_MOVN_2X32_FROM32(IVP_LSN_2X32_X(_basePtr, offset));
+    }
+    _offsetsVec = IVP_SELN_2X32UI(_offsetsVec, _offsetsVec, IVP_SELI_32B_ROTATE_RIGHT_1);
+    _condsVec   = IVP_SELN_2X32I(_condsVec, _condsVec, IVP_SELI_32B_ROTATE_RIGHT_1);
+    _dataVec    = IVP_SELN_2X32I(_elemVec, _dataVec, IVP_SELI_32B_ROTATE_RIGHT_1);
+  }
+  // Kept from the original: guarantees zero in predicated-off lanes.
+  return(IVP_MOVNX16_FROMN_2X32(IVP_MOVN_2X32T(_dataVec, (xb_vecN_2x32v) 0, _boolVec)));
+}
+#endif
+
+// Replacement for IVP_GATHERNX8UT_V, built when XCHAL_HAVE_SUPERGATHER == 0
+// and the name is already defined as a macro. Delegates to the unpredicated
+// IVP_GATHERNX8U_V(pdst, offs, dly) and then applies the predicate with
+// IVP_MOVNX16T, substituting 0 for lanes where `mask` is false.
+// NOTE(review): the underlying gather is not predicated, so all lanes are
+// read regardless of `mask`; confirm that offsets in masked-off lanes are
+// always safe to dereference.
+#ifdef IVP_GATHERNX8UT_V
+#undef IVP_GATHERNX8UT_V
+#define IVP_GATHERNX8UT_V(pdst, offs, mask, dly)  IVP_MOVNX16T(IVP_GATHERNX8U_V((pdst), (offs), (dly)), 0, mask)
+#endif
+#endif // XCHAL_HAVE_SUPERGATHER == 0
+
+////////// protos extension
+
+// Packs the high 16 bits of each element of a 32-way wide (48-bit) vector
+// into a narrow (16-bit) output vector register. Implemented by
+// reinterpreting the wide vector as 2Nx24 and calling IVP_PACKVRNR2NX24_1
+// with a shift amount of 8.
+// NOTE(review): which bits land in each output lane depends on
+// IVP_PACKVRNR2NX24_1 semantics - confirm against the ISA reference.
+#ifndef IVP_PACKHNX48
+#   define IVP_PACKHNX48(vec)  IVP_PACKVRNR2NX24_1(IVP_MOV2NX24_FROMNX48(vec), 8)
+#endif
+
+// Reinterprets 64 8-bit elements as 16 32-bit elements. Composed of two
+// existing reinterpret moves, going through the Nx16 view as the
+// intermediate type.
+#ifndef IVP_MOVN_2X32_FROM2NX8
+#   define IVP_MOVN_2X32_FROM2NX8(vec)  IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(vec))
+#endif
+
+// Reinterprets 16 32-bit elements as 64 8-bit elements. Composed of two
+// existing reinterpret moves, going through the Nx16 view as the
+// intermediate type.
+#ifndef IVP_MOV2NX8_FROMN_2X32
+#   define IVP_MOV2NX8_FROMN_2X32(vec)  IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(vec))
+#endif
+
+// Fallback for IVP_SELN_2X32I when the name is not already defined: views
+// both 32-bit operands as Nx16, applies IVP_SELNX16I with the same immediate
+// selector `i`, and views the result as N_2x32 again.
+// NOTE(review): this definition appears after static inline functions earlier
+// in the file that already call IVP_SELN_2X32I; if the name is not provided
+// elsewhere, those function bodies are compiled before this macro exists -
+// confirm the intended ordering.
+#ifndef IVP_SELN_2X32I
+#   define IVP_SELN_2X32I(a, b, i)  IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32(a), IVP_MOVNX16_FROMN_2X32(b), i))
+#endif
+
+// Produces the sequence 0..63 as an xb_vec2Nx8U vector.
+// Assuming IVP_SEQNX16() yields 0..31, multiplying lane i by 514 (0x0202)
+// places 2*i in both bytes of the 16-bit lane, and adding 256 (0x0100) bumps
+// the high byte to 2*i + 1; reinterpreted as bytes this gives 2*i, 2*i + 1.
+// NOTE(review): relies on IVP_MULNX16UPACKL keeping the low 16 bits of the
+// product and on little-endian byte order within a lane - confirm.
+#ifndef IVP_SEQ2NX8U
+#   define IVP_SEQ2NX8U()  IVP_MOV2NX8U_FROMNX16(IVP_ADDNX16U(256, IVP_MULNX16UPACKL(514, IVP_SEQNX16())))
+#endif
+
+// 64-way 8-bit zero vector, obtained by reinterpreting IVP_ZERONX16() as
+// unsigned 8-bit lanes.
+#ifndef IVP_ZERO2NX8U
+#   define IVP_ZERO2NX8U()  IVP_MOV2NX8U_FROMNX16(IVP_ZERONX16())
+#endif
+
+// 16-way 32-bit zero vector, obtained by reinterpreting IVP_ZERONX16() as
+// unsigned 32-bit lanes.
+#ifndef IVP_ZERON_2X32U
+#   define IVP_ZERON_2X32U()  IVP_MOVN_2X32U_FROMNX16(IVP_ZERONX16())
+#endif
+
+// 64-way 24-bit zero wide vector, obtained by reinterpreting IVP_ZERONX48()
+// as 2Nx24. IVP_ZERONX48 may be the fallback macro defined later in this
+// file; that is fine because it is only expanded where IVP_ZERO2NX24() is used.
+#ifndef IVP_ZERO2NX24
+#   define IVP_ZERO2NX24()  IVP_MOV2NX24_FROMNX48(IVP_ZERONX48())
+#endif
+
+// 32-way 48-bit zero
+#ifndef IVP_ZERONX48
+#   if XCHAL_HAVE_VISION
+#       define IVP_ZERONX48()  (IVP_CVT48UNX32L(IVP_ZERON_2X32U()))
+#   else
+#       define IVP_ZERONX48()  (IVP_MOVWVL(IVP_ZERONX16()))
+#   endif
+#endif
+
+////////// compatibility between IVPEP - VP5 (only when XCHAL_HAVE_VISION): older proto spellings are mapped onto the protos named on the right
+#if XCHAL_HAVE_VISION
+// vsaN is simply an alias of xb_vecNx16 in this configuration
+typedef xb_vecNx16  vsaN;
+// IVP_MOV*W* family: wide-vector moves mapped onto IVP_CVT* protos, with IVP_MOV*_FROM* reinterpretation where the vector types differ
+#   define IVP_MOVWVL(a)                  IVP_CVT48UNX32L(a)
+#   define IVP_MOVV2WHH(a)                IVP_MOVNX16_FROMN_2X32(IVP_CVT32S2NX24HH(IVP_MOV2NX24_FROMNX48(a)))
+#   define IVP_MOVV2WHL(a)                IVP_MOVNX16_FROMN_2X32(IVP_CVT32S2NX24HL(IVP_MOV2NX24_FROMNX48(a)))
+#   define IVP_MOVV2WLH(a)                IVP_MOVNX16_FROMN_2X32(IVP_CVT32S2NX24LH(IVP_MOV2NX24_FROMNX48(a)))
+#   define IVP_MOVV2WLL(a)                IVP_MOVNX16_FROMN_2X32(IVP_CVT32S2NX24LL(IVP_MOV2NX24_FROMNX48(a)))
+#   define IVP_MOVSVWH(a)                 IVP_MOVNX16_FROMN_2X32(IVP_CVT32SNX48H(a))
+#   define IVP_MOVSVWL(a)                 IVP_MOVNX16_FROMN_2X32(IVP_CVT32SNX48L(a))
+#   define IVP_MOVVWHH(a)                 IVP_MOVNX16_FROM2NX8(IVP_CVT64SNX48HH(a))
+#   define IVP_MOVVWHL(a)                 IVP_MOVNX16_FROM2NX8(IVP_CVT64SNX48HL(a))
+#   define IVP_MOVVWLH(a)                 IVP_MOVNX16_FROM2NX8(IVP_CVT64SNX48LH(a))
+#   define IVP_MOVVWLL(a)                 IVP_MOVNX16_FROM2NX8(IVP_CVT64SNX48LL(a))
+#   define IVP_MOVV2WL(a)                 IVP_CVT16U2NX24L(IVP_MOV2NX24_FROMNX48(a))
+#   define IVP_MOVV2WH(a)                 IVP_CVT16U2NX24H(IVP_MOV2NX24_FROMNX48(a))
+#   define IVP_MOVVWL(a)                  IVP_MOVNX16_FROMN_2X32(IVP_CVT32UNX48L(a))
+#   define IVP_MOVVWH(a)                  IVP_MOVNX16_FROMN_2X32(IVP_CVT32UNX48H(a))
+#   define IVP_MOVSV2WL(a)                IVP_CVT16S2NX24L(IVP_MOV2NX24_FROMNX48(a))
+#   define IVP_MOVSV2WH(a)                IVP_CVT16S2NX24H(IVP_MOV2NX24_FROMNX48(a))
+#   define IVP_MOV2W2VL(a, b)             IVP_MOVNX48_FROM2NX24(IVP_CVT24UNX32L(IVP_MOVN_2X32_FROMNX16(a), IVP_MOVN_2X32_FROMNX16(b)))
+#   define IVP_MOVSWV(a, b)               IVP_CVT48SNX32(IVP_MOVN_2X32_FROMNX16(a), IVP_MOVN_2X32_FROMNX16(b))
+#   define IVP_MOVS2WV(a, b)              IVP_MOVNX48_FROM2NX24(IVP_CVT24S2NX16(a, b))
+#   define IVP_MOVWV(a, b)                IVP_CVT48UNX32(IVP_MOVN_2X32_FROMNX16(a), IVP_MOVN_2X32_FROMNX16(b))
+// vsaN helpers: identity or simple Nx16 / 2Nx8 ops; the brace-block macros below are statements, not expressions, and evaluate their arguments more than once
+#   define IVP_MOVVVS(a)                  (a)
+#   define IVP_MOVVSA32(a)                IVP_MOVVA16(a)
+#   define IVP_MOVVSV(vr, sa)             (vr) // sa is always zero in XI, if not zero -> use IVP_MOVVSELNX16
+#   define IVP_MOVVSELNX16(vr, sa)        IVP_SRLINX16(vr, sa)
+#   define IVP_MOVVSVADDNX16(a, b, c, d)  { a = c; c = IVP_ADDNX16(c, b); } // d is always zero in XI
+#   define IVP_MOVPVSV(a, b, c, d)        { xb_vec2Nx8 t = IVP_SRLI2NX8(c, d); a = IVP_UNPKS2NX8_1(t); b = IVP_UNPKS2NX8_0(t); }
+// Unconditionally replace the IVP_LSNX8U_* loads with wrappers around IVP_LS2NX8U_* whose result goes through IVP_MOVNX16_FROM8U
+#undef IVP_LSNX8U_XP
+#undef IVP_LSNX8U_IP
+#undef IVP_LSNX8U_X
+#undef IVP_LSNX8U_I
+#   define IVP_LSNX8U_XP(a, b, c)  do { xb_int8U tmp; IVP_LS2NX8U_XP(tmp, b, c); a = IVP_MOVNX16_FROM8U(tmp); } while (0)
+#   define IVP_LSNX8U_IP(a, b, c)  do { xb_int8U tmp; IVP_LS2NX8U_IP(tmp, b, c); a = IVP_MOVNX16_FROM8U(tmp); } while (0)
+#   define IVP_LSNX8U_X(b, c)      IVP_MOVNX16_FROM8U(IVP_LS2NX8U_X(b, c))
+#   define IVP_LSNX8U_I(b, c)      IVP_MOVNX16_FROM8U(IVP_LS2NX8U_I(b, c))
+// IVP_PACKLNX48_L / _H are aliases of the unsigned 48 -> 32 conversions (presumably low / high half, per the L / H suffix - confirm)
+#   define IVP_PACKLNX48_L(a)      IVP_CVT32UNX48L(a)
+#   define IVP_PACKLNX48_H(a)      IVP_CVT32UNX48H(a)
+// Pure renames (object-like macros): spelling on the left maps to the spelling on the right; the V and non-V spellings share a target
+#   define IVP_SA2NX8UPOS_FP    IVP_SAPOS2NX8U_FP
+#   define IVP_SAN_2X32POS_FP   IVP_SAPOSN_2X32_FP
+#   define IVP_SANX16POS_FP     IVP_SAPOSNX16_FP
+#   define IVP_SANX16UPOS_FP    IVP_SAPOSNX16U_FP
+#   define IVP_SANX8UPOS_FP     IVP_SAPOSNX8U_FP
+#   define IVP_SAV2NX8POS_FP    IVP_SAPOS2NX8_FP
+#   define IVP_SAV2NX8UPOS_FP   IVP_SAPOS2NX8U_FP
+#   define IVP_SAVN_2X32POS_FP  IVP_SAPOSN_2X32_FP
+#   define IVP_SAVNX16POS_FP    IVP_SAPOSNX16_FP
+#   define IVP_SAVNX16UPOS_FP   IVP_SAPOSNX16U_FP
+#   define IVP_SAVNX8UPOS_FP    IVP_SAPOSNX8U_FP
+#   define IVP_LAVNX8U_PP       IVP_LANX8U_PP
+#   define IVP_LAVNX16_PP       IVP_LANX16_PP
+// Reductions cast to int, then the *F variants: each forwards to the matching *T proto with its predicate inverted through IVP_NOTBN
+#   define IVP_RADDURNX16(b)           ((int) IVP_RADDUNX16(b))
+#   define IVP_RADDRNX16(b)            ((int) IVP_RADDNX16(b))
+#   define IVP_ADDSNX16F(a, b, c, d)   IVP_ADDSNX16T(a, b, c, IVP_NOTBN(d))
+#   define IVP_ADDNX16F(a, b, c, d)    IVP_ADDNX16T(a, b, c, IVP_NOTBN(d))
+#   define IVP_SUBNX16F(a, b, c, d)    IVP_SUBNX16T(a, b, c, IVP_NOTBN(d))
+#   define IVP_NEGNX16F(a, b, c)       IVP_NEGNX16T(a, b, IVP_NOTBN(c))
+#   define IVP_NEGSNX16F(a, b, c)      IVP_NEGSNX16T(a, b, IVP_NOTBN(c))
+#   define IVP_RMINNX16F(b, c)         IVP_RMINNX16T(b, IVP_NOTBN(c))
+#   define IVP_MINUNX16F(a, b, c, d)   IVP_MINUNX16T(a, b, c, IVP_NOTBN(d))
+#   define IVP_SVNX8UF_XP(a, b, c, d)  IVP_SVNX8UT_XP(a, b, c, IVP_NOTBN(d))
+#   define IVP_SVNX8UF_I(a, b, c, d)   IVP_SVNX8UT_I(a, b, c, IVP_NOTBN(d))
+#   define IVP_SVNX16F_XP(a, b, c, d)  IVP_SVNX16T_XP(a, b, c, IVP_NOTBN(d))
+#   define IVP_SVNX16F_I(a, b, c, d)   IVP_SVNX16T_I(a, b, c, IVP_NOTBN(d))
+#endif
+// IVP__LSNX16_XP(a, b, c): on XCHAL_HAVE_VISION cores, load into an xb_int16 temporary with IVP_LSNX16_XP and assign IVP_MOVNX16_FROM16(tmp) to a; otherwise a plain alias of IVP_LSNX16_XP.
+#if XCHAL_HAVE_VISION
+#   define IVP__LSNX16_XP(a, b, c)  do { xb_int16 tmp; IVP_LSNX16_XP(tmp, b, c); a = IVP_MOVNX16_FROM16(tmp); } while (0)
+#else
+#   define IVP__LSNX16_XP  IVP_LSNX16_XP
+#endif
+// IVP__LSNX16_IP(a, b, c): on XCHAL_HAVE_VISION cores, load into an xb_int16 temporary with IVP_LSNX16_IP and assign IVP_MOVNX16_FROM16(tmp) to a; otherwise a plain alias of IVP_LSNX16_IP.
+#if XCHAL_HAVE_VISION
+#   define IVP__LSNX16_IP(a, b, c)  do { xb_int16 tmp; IVP_LSNX16_IP(tmp, b, c); a = IVP_MOVNX16_FROM16(tmp); } while (0)
+#else
+#   define IVP__LSNX16_IP  IVP_LSNX16_IP
+#endif
+// IVP__DSELNX16_2X16(a, b, c, d, e, f): vision cores emulate the double select with two IVP_SELNX16 calls (index e -> a, index f -> b); c and d are copied to temporaries first, presumably so a / b may alias them. Brace-block statement macro.
+#if XCHAL_HAVE_VISION
+#   define IVP__DSELNX16_2X16(a, b, c, d, e, f)  { \
+    xb_vecNx16 _v0, _v1;                           \
+    _v0 = d;                                       \
+    _v1 = c;                                       \
+    a   = IVP_SELNX16(_v1, _v0, e);                \
+    b   = IVP_SELNX16(_v1, _v0, f);                \
+}
+#else
+#   define IVP__DSELNX16_2X16  IVP_DSELNX16
+#endif
+// IVP__SEL2NX8_2X16(b, c, d, e): vision cores first merge the two Nx16 index vectors d and e into one 2Nx8 index vector (IVP_SEL2NX8I with IVP_SELI_8B_INTERLEAVE_1_EVEN), then call IVP_SEL2NX8; otherwise an alias of IVP_SEL2NX8.
+#if XCHAL_HAVE_VISION
+#   define IVP__SEL2NX8_2X16(b, c, d, e)  IVP_SEL2NX8(b, c, IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(d), IVP_MOV2NX8_FROMNX16(e), IVP_SELI_8B_INTERLEAVE_1_EVEN))
+#else
+#   define IVP__SEL2NX8_2X16  IVP_SEL2NX8
+#endif
+
+////////// compatibility for RF-2014.0 IVP-EP cores
+// IVP_SVN_2X32_IP(a, b, c) fallback: store a through IVP_SVNX16_IP using an xb_vecNx16* copy of b, then write the updated pointer back to b (b must be an lvalue; the local is named bb, so do not pass a variable called bb).
+#ifndef IVP_SVN_2X32_IP
+#define IVP_SVN_2X32_IP(a, b, c)                     \
+  do {                                               \
+    xb_vecNx16 *bb = (xb_vecNx16 *) b;               \
+    IVP_SVNX16_IP(IVP_MOVNX16_FROMN_2X32(a), bb, c); \
+    b = (xb_vecN_2x32v *) bb;                        \
+  } while (0)
+#endif
+// IVP_SVN_2X32_XP(a, b, c) fallback: store a through IVP_SVNX16_XP using an xb_vecNx16* copy of b, then write the updated pointer back to b (b must be an lvalue; the local is named bb, so do not pass a variable called bb).
+#ifndef IVP_SVN_2X32_XP
+#define IVP_SVN_2X32_XP(a, b, c)                     \
+  do {                                               \
+    xb_vecNx16 *bb = (xb_vecNx16 *) b;               \
+    IVP_SVNX16_XP(IVP_MOVNX16_FROMN_2X32(a), bb, c); \
+    b = (xb_vecN_2x32v *) bb;                        \
+  } while (0)
+#endif
+// IVP_LVN_2X32_IP(a, b, c) fallback: load an xb_vecNx16 through IVP_LVNX16_IP, reinterpret it into a, and write the updated pointer back to b (a and b must be lvalues; locals are named aa and bb, so avoid passing variables with those names).
+#ifndef IVP_LVN_2X32_IP
+#define IVP_LVN_2X32_IP(a, b, c)             \
+  do {                                       \
+    xb_vecNx16 *bb = (xb_vecNx16 *) b;       \
+    xb_vecNx16 aa; IVP_LVNX16_IP(aa, bb, c); \
+    a = IVP_MOVN_2X32_FROMNX16(aa);          \
+    b = (xb_vecN_2x32v *) bb;                \
+  } while (0)
+#endif
+// IVP_LVN_2X32_XP(a, b, c) fallback: load an xb_vecNx16 through IVP_LVNX16_XP, reinterpret it into a, and write the updated pointer back to b (a and b must be lvalues; locals are named aa and bb, so avoid passing variables with those names).
+#ifndef IVP_LVN_2X32_XP
+#define IVP_LVN_2X32_XP(a, b, c)             \
+  do {                                       \
+    xb_vecNx16 *bb = (xb_vecNx16 *) b;       \
+    xb_vecNx16 aa; IVP_LVNX16_XP(aa, bb, c); \
+    a = IVP_MOVN_2X32_FROMNX16(aa);          \
+    b = (xb_vecN_2x32v *) bb;                \
+  } while (0)
+#endif
+
+////////// select/shuffle indexes: each macro builds a 2Nx8 index vector as IVP_AVGU2NX8 of IVP_SEQ2NX8() and a constant passed as one 16-bit value (presumably replicated to every lane - confirm)
+#if XCHAL_HAVE_VISION
+#define XAI_DSEL_16B_ROTATE_LEFT(n)   IVP_AVGU2NX8(IVP_SEQ2NX8(), IVP_MOV2NX8_FROMNX16((0x4000 - 2 * (((n) << 8) + (n)))))
+#define XAI_DSEL_16B_ROTATE_RIGHT(n)  IVP_AVGU2NX8(IVP_SEQ2NX8(), IVP_MOV2NX8_FROMNX16((0x3F00 + 2 * (((n) << 8) + (n)))))
+// Fixed-amount variants: the low byte of the constant carries k and the high byte k + 1, scaled by 2 (16B names) or 4 (32B names); NOTE(review): exact rotate semantics are not derivable from this file.
+#define XAI_DSEL_16B_ROTATE_RIGHT_2_1  IVP_AVGU2NX8(IVP_SEQ2NX8(), IVP_MOV2NX8_FROMNX16(2 * (1 + ((1 + 1) << 8))))
+#define XAI_DSEL_16B_ROTATE_RIGHT_4_3  IVP_AVGU2NX8(IVP_SEQ2NX8(), IVP_MOV2NX8_FROMNX16(2 * (3 + ((3 + 1) << 8))))
+#define XAI_DSEL_32B_ROTATE_RIGHT_2_1  IVP_AVGU2NX8(IVP_SEQ2NX8(), IVP_MOV2NX8_FROMNX16(4 * (1 + ((1 + 1) << 8))))
+#define XAI_DSEL_32B_ROTATE_RIGHT_4_3  IVP_AVGU2NX8(IVP_SEQ2NX8(), IVP_MOV2NX8_FROMNX16(4 * (3 + ((3 + 1) << 8))))
+#endif
+// OFFSET_PTR_*(ptr, nrows, stride, in_row_offset): cast ptr to the scalar element pointer, advance by in_row_offset + nrows * stride ELEMENTS (1-byte units for the 8-bit variants, 2-byte units for the NX16 variants), then cast to the vector pointer type.
+#define OFFSET_PTR_NX8(ptr, nrows, stride, in_row_offset)    ((xb_vecNx8 *)     ((int8_t *)  (ptr) + (in_row_offset) + (nrows) * (stride)))
+#define OFFSET_PTR_NX8U(ptr, nrows, stride, in_row_offset)   ((xb_vecNx8U *)    ((uint8_t *) (ptr) + (in_row_offset) + (nrows) * (stride)))
+#define OFFSET_PTR_2NX8(ptr, nrows, stride, in_row_offset)   ((xb_vec2Nx8 *)    ((int8_t *)  (ptr) + (in_row_offset) + (nrows) * (stride)))
+#define OFFSET_PTR_2NX8U(ptr, nrows, stride, in_row_offset)  ((xb_vec2Nx8U *) ((uint8_t *) (ptr) + (in_row_offset) + (nrows) * (stride)))
+#define OFFSET_PTR_NX16(ptr, nrows, stride, in_row_offset)   ((xb_vecNx16 *)    ((int16_t *) (ptr) + (in_row_offset) + (nrows) * (stride)))
+#define OFFSET_PTR_NX16U(ptr, nrows, stride, in_row_offset)  ((xb_vecNx16U *) ((uint16_t *) (ptr) + (in_row_offset) + (nrows) * (stride)))
+#endif
diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_common.h b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_common.h
new file mode 100644
index 00000000000..afcf5b87786
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_common.h
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2021 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#ifndef __XAI_CNN_API_COMMON_H__
+#define __XAI_CNN_API_COMMON_H__
+
+#include "xai_cnn_api_params.h"
+#include "xai_config_api.h"
+#include "xai_core_api.h"
+#include "xai_tile_manager.h"
+#include <math.h>
+#include <stdbool.h>
+
+
+// ElementWise APIs. Add family: (inTile1, inTile2) -> outTile, returns XAI_ERR_TYPE. The suffix (_S8 ... _U32) presumably names the element type and the unsuffixed entry presumably dispatches on it - TODO confirm.
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_AV(const xai_pTile3D inTile1,
+                                          const xai_pTile3D inTile2,
+                                          xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_S8_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_U8_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_S16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_U16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_S32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_U32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+// Eltwise Sub family: (inTile1, inTile2) -> outTile, returns XAI_ERR_TYPE. Operand order is not visible here (presumably inTile1 - inTile2) - TODO confirm.
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_AV(const xai_pTile3D inTile1,
+                                          const xai_pTile3D inTile2,
+                                          xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_S8_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_U8_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_S16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_U16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_S32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_U32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+// Eltwise Mul: (inTile1, inTile2) -> outTile, returns XAI_ERR_TYPE. Only this S32 entry is declared in this header.
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+// Eltwise Max family: (inTile1, inTile2) -> outTile, returns XAI_ERR_TYPE; unsuffixed entry plus S8/U8/S16/U16/S32/U32 variants.
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_AV(const xai_pTile3D inTile1,
+                                          const xai_pTile3D inTile2,
+                                          xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_S8_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_U8_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_S16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_U16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_S32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_U32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+// Eltwise Min family: (inTile1, inTile2) -> outTile, returns XAI_ERR_TYPE; unsuffixed entry plus S8/U8/S16/U16/S32/U32 variants.
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_AV(const xai_pTile3D inTile1,
+                                          const xai_pTile3D inTile2,
+                                          xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_S8_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_U8_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_S16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_U16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_S32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_U32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+// Eltwise Or family (presumably bitwise - TODO confirm): (inTile1, inTile2) -> outTile, returns XAI_ERR_TYPE; unsuffixed entry plus S8/U8/S16/U16/S32/U32 variants.
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_AV(const xai_pTile3D inTile1,
+                                         const xai_pTile3D inTile2,
+                                         xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_S8_AV(const xai_pTile3D inTile1,
+                                            const xai_pTile3D inTile2,
+                                            xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_U8_AV(const xai_pTile3D inTile1,
+                                            const xai_pTile3D inTile2,
+                                            xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_S16_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_U16_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_S32_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_U32_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+// Eltwise And family (presumably bitwise - TODO confirm): (inTile1, inTile2) -> outTile, returns XAI_ERR_TYPE; unsuffixed entry plus S8/U8/S16/U16/S32/U32 variants.
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_AV(const xai_pTile3D inTile1,
+                                          const xai_pTile3D inTile2,
+                                          xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_S8_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_U8_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_S16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_U16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_S32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_U32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+// Eltwise Xor family (presumably bitwise - TODO confirm): (inTile1, inTile2) -> outTile, returns XAI_ERR_TYPE; unsuffixed entry plus S8/U8/S16/U16/S32/U32 variants.
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_AV(const xai_pTile3D inTile1,
+                                          const xai_pTile3D inTile2,
+                                          xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_S8_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_U8_AV(const xai_pTile3D inTile1,
+                                             const xai_pTile3D inTile2,
+                                             xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_S16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_U16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_S32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_U32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+// Eltwise Equal family: (inTile1, inTile2) -> outTile, returns XAI_ERR_TYPE. How the comparison result is encoded in outTile is not visible in this header - TODO confirm.
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_AV(const xai_pTile3D inTile1,
+                                            const xai_pTile3D inTile2,
+                                            xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_S8_AV(const xai_pTile3D inTile1,
+                                               const xai_pTile3D inTile2,
+                                               xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_U8_AV(const xai_pTile3D inTile1,
+                                               const xai_pTile3D inTile2,
+                                               xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_S16_AV(const xai_pTile3D inTile1,
+                                                const xai_pTile3D inTile2,
+                                                xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_U16_AV(const xai_pTile3D inTile1,
+                                                const xai_pTile3D inTile2,
+                                                xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_S32_AV(const xai_pTile3D inTile1,
+                                                const xai_pTile3D inTile2,
+                                                xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_U32_AV(const xai_pTile3D inTile1,
+                                                const xai_pTile3D inTile2,
+                                                xai_pTile3D outTile);
+
+// Eltwise GreaterThan family: (inTile1, inTile2) -> outTile, returns XAI_ERR_TYPE. Presumably tests inTile1 > inTile2; result encoding in outTile is not visible here - TODO confirm.
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_AV(const xai_pTile3D inTile1,
+                                                  const xai_pTile3D inTile2,
+                                                  xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_S8_AV(const xai_pTile3D inTile1,
+                                                     const xai_pTile3D inTile2,
+                                                     xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_U8_AV(const xai_pTile3D inTile1,
+                                                     const xai_pTile3D inTile2,
+                                                     xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_S16_AV(const xai_pTile3D inTile1,
+                                                      const xai_pTile3D inTile2,
+                                                      xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_U16_AV(const xai_pTile3D inTile1,
+                                                      const xai_pTile3D inTile2,
+                                                      xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_S32_AV(const xai_pTile3D inTile1,
+                                                      const xai_pTile3D inTile2,
+                                                      xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_U32_AV(const xai_pTile3D inTile1,
+                                                      const xai_pTile3D inTile2,
+                                                      xai_pTile3D outTile);
+
+// Eltwise LessThan family: (inTile1, inTile2) -> outTile, returns XAI_ERR_TYPE. Presumably tests inTile1 < inTile2; result encoding in outTile is not visible here - TODO confirm.
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_AV(const xai_pTile3D inTile1,
+                                               const xai_pTile3D inTile2,
+                                               xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_S8_AV(const xai_pTile3D inTile1,
+                                                  const xai_pTile3D inTile2,
+                                                  xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_U8_AV(const xai_pTile3D inTile1,
+                                                  const xai_pTile3D inTile2,
+                                                  xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_S16_AV(const xai_pTile3D inTile1,
+                                                   const xai_pTile3D inTile2,
+                                                   xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_U16_AV(const xai_pTile3D inTile1,
+                                                   const xai_pTile3D inTile2,
+                                                   xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_S32_AV(const xai_pTile3D inTile1,
+                                                   const xai_pTile3D inTile2,
+                                                   xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_U32_AV(const xai_pTile3D inTile1,
+                                                   const xai_pTile3D inTile2,
+                                                   xai_pTile3D outTile);
+
+// F16 element-wise entries, declared only when XCHAL_HAVE_VISION_HP_VFPU == 1: Add, Sub, Max, Min, Equal, GreaterThan, LessThan (no Mul/Or/And/Xor F16 entries are declared in this header).
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_F16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_F16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_F16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_F16_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_F16_AV(const xai_pTile3D inTile1,
+                                                const xai_pTile3D inTile2,
+                                                xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_F16_AV(const xai_pTile3D inTile1,
+                                                      const xai_pTile3D inTile2,
+                                                      xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_F16_AV(const xai_pTile3D inTile1,
+                                                   const xai_pTile3D inTile2,
+                                                   xai_pTile3D outTile);
+#endif //#if XCHAL_HAVE_VISION_HP_VFPU == 1
+
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+// F32 element-wise entries, declared only under the guard above: Add, Sub, Max, Min, Equal, GreaterThan, LessThan (no Mul/Or/And/Xor F32 entries are declared in this header).
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_F32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_F32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_F32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_F32_AV(const xai_pTile3D inTile1,
+                                              const xai_pTile3D inTile2,
+                                              xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_F32_AV(const xai_pTile3D inTile1,
+                                                const xai_pTile3D inTile2,
+                                                xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_F32_AV(const xai_pTile3D inTile1,
+                                                      const xai_pTile3D inTile2,
+                                                      xai_pTile3D outTile);
+
+
+_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_F32_AV(const xai_pTile3D inTile1,
+                                                   const xai_pTile3D inTile2,
+                                                   xai_pTile3D outTile);
+#endif //#if XCHAL_HAVE_VISION_SP_VFPU == 1
+// xaiCast3D: inTile -> outTile, returns XAI_ERR_TYPE; declared unconditionally. Presumably converts between tile element types - TODO confirm.
+_XAI_API_ XAI_ERR_TYPE xaiCast3D(const xai_pTile3D inTile,
+                                 xai_pTile3D outTile);
+#endif //#ifndef __XAI_CNN_API_COMMON_H__
diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_params.h b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_params.h
new file mode 100644
index 00000000000..51d4cb75358
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_params.h
@@ -0,0 +1,1886 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#ifndef __XAI_CNN_API_PARAMS_H__
+#define __XAI_CNN_API_PARAMS_H__
+
+#include "xai_config_api.h"
+#include "xai_core_api.h"
+#include "xai_tile_manager.h"
+#include <math.h>
+#include <stdbool.h>
+
+#define TFL_QUANTIZATION_MODE_BIT_EXACT    1  // NOTE(review): presumably the values stored in the quantization_mode fields of the params structs below -- confirm
+#define TFL_QUANTIZATION_MODE_APPROXIMATE  2
+#define XNNC_QUANTIZATION_MODE             3
+#define TFL_USE_ACT_TIE                    4
+
+#ifndef FLT_MIN
+#define FLT_MIN  (1.175494351e-38F)  // defined only if no earlier header (e.g. <float.h>) provided it: smallest positive normal binary32 value
+#endif
+
+#ifndef FLT_MAX
+#define FLT_MAX  (3.402823466e+38F)  // defined only if no earlier header (e.g. <float.h>) provided it: largest finite binary32 value
+#endif
+
+
+#if defined(__clang__) && (defined(GLOW_BUILD) || defined(GLOW_WITH_XTENSA))
+// Glow/clang branch: force the HP VFPU macro to 1, but only when the core config already defines it.
+#ifdef XCHAL_HAVE_VISION_HP_VFPU
+#undef XCHAL_HAVE_VISION_HP_VFPU
+#define XCHAL_HAVE_VISION_HP_VFPU  1
+#endif
+// When XCHAL_IVPN_SIMD_WIDTH is 64, also define three more VFPU feature macros as 1. NOTE(review): no #undef precedes these -- confirm the core config never pre-defines them with another value.
+#ifdef XCHAL_IVPN_SIMD_WIDTH
+#if (XCHAL_IVPN_SIMD_WIDTH == 64)
+#define XCHAL_HAVE_CONNX_B_HP_VFPU  1
+#define XCHAL_HAVE_VISION_SP_VFPU   1
+#define XCHAL_HAVE_BBENEP_SP_VFPU   1
+#endif
+#endif
+
+#include <math.h>
+
+#if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+# undef ENABLE_F16_PRECISION
+# define ENABLE_F16_PRECISION  1
+#endif
+// The bit-exact reference switches are always turned off in this branch.
+#ifdef BIT_EXACT_FP16_REF
+#  undef BIT_EXACT_FP16_REF
+#endif
+
+#ifdef BIT_EXACT_FP32_REF
+#  undef BIT_EXACT_FP32_REF
+#endif
+// This branch is C++ only (<cstdint>, shared::float16): xb_f16 becomes shared::float16 after any xb_f16 macro is removed.
+#include "fp16.h"
+#include <cstdint>
+#include "shared/Common/Float16.h"
+#undef xb_f16
+typedef shared::float16  xb_f16;
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+// The <math.h> include and the BIT_EXACT_FP32_REF #undef below repeat what this branch already did above; redundant but harmless.
+#include <math.h>
+#undef ENABLE_F32_PRECISION
+#define ENABLE_F32_PRECISION  1
+
+#ifdef BIT_EXACT_FP32_REF
+#  undef BIT_EXACT_FP32_REF
+#endif
+#endif // #if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+
+// MLIR builds cannot use the contents of these include files, but they
+// currently do not need the symbols defined in them.
+#elif !defined(MLIR_BUILD)
+#ifndef XAI_REF_ONLY_COMPILATION
+#include <xtensa/tie/xt_misc.h>
+#if (XCHAL_HAVE_HIFI1 || XCHAL_HAVE_HIFI3Z || XCHAL_HAVE_HIFI4 || XCHAL_HAVE_HIFI5)
+#include <xtensa/tie/xt_hifi2.h>
+#else
+#include <xtensa/tie/xt_ivpn.h>
+#endif
+#endif
+#if (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION) && !defined(GENERIC_XTENSA_BUILD))
+typedef _Float16  xb_f16;  // clang reference-only build: xb_f16 is the compiler's _Float16
+#elif defined(GENERIC_BUILD)
+typedef float xb_f16;  // NOTE: in this configuration xb_f16 is a plain float, not a 16-bit type
+#endif
+#elif defined(MLIR_BUILD) && defined(XAI_REF_ONLY_COMPILATION)
+typedef float xb_f16;  // NOTE: in this configuration xb_f16 is a plain float, not a 16-bit type
+#endif // #if defined(__clang__) && (defined(GLOW_BUILD) || defined(GLOW_WITH_XTENSA))
+
+#if defined (BIT_EXACT_FP16_REF)
+#undef XAI_F16_half
+#define XAI_F16_half  IVP_CVTF16F32(0.5f)
+#else
+#undef XAI_F16_half
+#define XAI_F16_half  (xb_f16) (0.5f)
+#endif
+// Range limits. The *_MIN names are the most negative finite values (-65504 for half, -FLT_MAX for float), not the smallest positive ones.
+#define XAI_F16_MIN_FLT  (float) (-65504.0f)
+#define XAI_F16_MAX_FLT  (float) (65504.0f)
+#define XAI_F32_MIN_FLT  (float) (-FLT_MAX)
+#define XAI_F32_MAX_FLT  (float) (FLT_MAX)
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+#define XAI_F16_MIN         (xb_f16) (-65504.0f)
+#define XAI_F16_MAX         (xb_f16) (65504.0f)
+#define XAI_F16_MIN_VECN    (xb_vecNxf16) (-65504.0f)
+#define XAI_F16_MAX_VECN    (xb_vecNxf16) (65504.0f)
+#define XAI_F16_MIN_VECN32  (xb_vecN_2xf32) (-65504.0f)
+#define XAI_F16_MAX_VECN32  (xb_vecN_2xf32) (65504.0f)
+#define XAI_F16_POS_MIN     (xb_f16) (6.10352e-5F)  // about 2^-14: smallest positive normal half-precision value
+#endif
+
+/***************************************************************************************/
+/* log2 function is not defined in Visual Studio 2012 but available in higher versions */
+/* _MSC_VER version number check to be performed for visual studio version             */
+/* If _MSC_VER <= (Visual Studio 2012) version log2 function  is enabled               */
+/* Visual Studio Version Information :                                                 */
+/* MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015)                                   */
+/* MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013)                                   */
+/* MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012)                                   */
+/* MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010)                                   */
+/* MSVC++ 9.0  _MSC_VER == 1500 (Visual Studio 2008)                                   */
+/* MSVC++ 8.0  _MSC_VER == 1400 (Visual Studio 2005)                                   */
+/***************************************************************************************/
+
+#if defined(_MSC_VER)
+#if _MSC_VER <= 1700  // Visual Studio 2012 or older, per the version table above
+#include "math.h"
+static _XAI_INLINE_ double log2(double number)
+{
+  /* Change of base: log2(number) = ln(number) / ln(2).  */
+  return(log(number) / log(2.0));
+}
+#endif
+#endif
+
+#define CNN_CONV_FLAG_RELU                              (1 << 0)  // masks for the 'flags' byte of the conv params structs; the bit table lives in xai_cnn_conv_params
+#define CNN_CONV_FLAG_LEFTEDGE                          (1 << 1)
+#define CNN_CONV_FLAG_TOPEDGE                           (1 << 2)
+#define CNN_CONV_FLAG_INPUT                             (1 << 3)
+#define CNN_CONV_FLAG_OUTPUT                            (1 << 4)
+// Masks for xai_cnn_pooling_params.edgeFlag. The bit order differs from the conv flags: left edge is bit 0, top edge is bit 1.
+#define CNN_POOLING_TOPEDGE_FLAG                        (1 << 1)
+#define CNN_POOLING_LEFTEDGE_FLAG                       (1 << 0)
+// Normalization axis selectors; the *_AND_* names are bitwise ORs of the single-axis bits.
+#define CNN_NORMALIZE_ALONG_WIDTH                       (1 << 0)
+#define CNN_NORMALIZE_ALONG_HEIGHT                      (1 << 1)
+#define CNN_NORMALIZE_ALONG_DEPTH                       (1 << 2)
+#define CNN_NORMALIZE_ALONG_BATCH                       (1 << 3)
+#define CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT            (CNN_NORMALIZE_ALONG_WIDTH | CNN_NORMALIZE_ALONG_HEIGHT)
+#define CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT_AND_DEPTH  (CNN_NORMALIZE_ALONG_WIDTH | CNN_NORMALIZE_ALONG_HEIGHT | CNN_NORMALIZE_ALONG_DEPTH)
+#define CNN_NORMALIZE_CHANNEL_SHARE_FLAG                (1 << 0)  // NOTE(review): same value as CNN_NORMALIZE_ALONG_WIDTH -- presumably tested against a different field; confirm
+// Tile position codes: FIRST_AND_LAST (3) equals FIRST (1) | LAST (2).
+#define CNN_GLOBAL_POOL_INTERMEDIATE_TILE               0
+#define CNN_GLOBAL_POOL_FIRST_TILE                      1
+#define CNN_GLOBAL_POOL_LAST_TILE                       2
+#define CNN_GLOBAL_POOL_FIRST_AND_LAST_TILE             3
+// Same numeric encoding as the global-pool tile codes above.
+#define CNN_NORMALIZE_INTERMEDIATE_TILE                 0
+#define CNN_NORMALIZE_FIRST_TILE                        1
+#define CNN_NORMALIZE_LAST_TILE                         2
+#define CNN_NORMALIZE_FIRST_AND_LAST_TILE               3
+#define CNN_EXP_LUT_PARTITION                           3
+
+typedef struct
+{ // Resize parameter block; fields are read and written through the XAI_CNN_RESIZE3D_GET_*/SET_* macros.
+  float   widthScale;        // NOTE(review): scale direction (in/out or out/in) is not visible here -- confirm
+  float   heightScale;
+  float   xshift;
+  float   yshift;
+  int8_t  alignCorners;      // flag, set via XAI_CNN_RESIZE3D_SET_FLAG_ALIGN_CORNERS
+  int8_t  halfPixelCenters;  // flag, set via XAI_CNN_RESIZE3D_SET_FLAG_HALF_PIXEL_CENTERS
+  int32_t zeroPtInput;       // input zero point (XAI_CNN_RESIZE3D_*_ZERO_POINT_INPUT)
+  int32_t zeroPtOutput;      // output zero point (XAI_CNN_RESIZE3D_*_ZERO_POINT_OUTPUT)
+  int32_t outMultiplier;     // NOTE(review): presumably a fixed-point requantization multiplier -- confirm
+  int32_t outShift;          // NOTE(review): presumably the shift paired with outMultiplier -- confirm
+  int32_t widthFrame;
+  int32_t heightFrame;
+  int8_t  quantization_mode;  // NOTE(review): presumably one of the *_QUANTIZATION_MODE constants defined near the top of this header -- confirm
+} xai_cnn_resizeA3D_params;
+
+#define XAI_CNN_RESIZE3D_GET_WIDTHSCALE(x)                  ((x)->widthScale)
+#define XAI_CNN_RESIZE3D_GET_HEIGHTSCALE(x)                 ((x)->heightScale)
+#define XAI_CNN_RESIZE3D_GET_XSHIFT(x)                      ((x)->xshift)
+#define XAI_CNN_RESIZE3D_GET_YSHIFT(x)                      ((x)->yshift)
+#define XAI_CNN_RESIZE3D_GET_FLAG_ALIGN_CORNERS(x)          ((x)->alignCorners)
+#define XAI_CNN_RESIZE3D_GET_FLAG_HALF_PIXEL_CENTERS(x)     ((x)->halfPixelCenters)
+#define XAI_CNN_RESIZE3D_GET_ZERO_POINT_INPUT(x)            ((x)->zeroPtInput)
+#define XAI_CNN_RESIZE3D_GET_ZERO_POINT_OUTPUT(x)           ((x)->zeroPtOutput)
+#define XAI_CNN_RESIZE3D_GET_OUT_MULTIPLIER(x)              ((x)->outMultiplier)
+#define XAI_CNN_RESIZE3D_GET_OUT_SHIFT(x)                   ((x)->outShift)
+#define XAI_CNN_RESIZE3D_GET_WIDTHFRAME(x)                  ((x)->widthFrame)
+#define XAI_CNN_RESIZE3D_GET_HEIGHTFRAME(x)                 ((x)->heightFrame)
+// Setters: x is a pointer to xai_cnn_resizeA3D_params; both x and v are parenthesised in every expansion.
+#define XAI_CNN_RESIZE3D_SET_WIDTHSCALE(x, v)               ((x)->widthScale = (v))
+#define XAI_CNN_RESIZE3D_SET_HEIGHTSCALE(x, v)              ((x)->heightScale = (v))
+#define XAI_CNN_RESIZE3D_SET_XSHIFT(x, v)                   ((x)->xshift = (v))
+#define XAI_CNN_RESIZE3D_SET_YSHIFT(x, v)                   ((x)->yshift = (v))
+#define XAI_CNN_RESIZE3D_SET_FLAG_ALIGN_CORNERS(x, v)       ((x)->alignCorners = (v))
+#define XAI_CNN_RESIZE3D_SET_FLAG_HALF_PIXEL_CENTERS(x, v)  ((x)->halfPixelCenters = (v))
+#define XAI_CNN_RESIZE3D_SET_ZERO_POINT_INPUT(x, v)         ((x)->zeroPtInput = (v))
+#define XAI_CNN_RESIZE3D_SET_ZERO_POINT_OUTPUT(x, v)        ((x)->zeroPtOutput = (v))
+#define XAI_CNN_RESIZE3D_SET_OUT_MULTIPLIER(x, v)           ((x)->outMultiplier = (v))
+#define XAI_CNN_RESIZE3D_SET_OUT_SHIFT(x, v)                ((x)->outShift = (v))
+#define XAI_CNN_RESIZE3D_SET_WIDTHFRAME(x, v)               ((x)->widthFrame = (v))
+#define XAI_CNN_RESIZE3D_SET_HEIGHTFRAME(x, v)              ((x)->heightFrame = (v))
+
+#define XAI_CNN_RESIZE3D_GET_QUANTIZATION_MODE(x)           ((x)->quantization_mode)
+#define XAI_CNN_RESIZE3D_SET_QUANTIZATION_MODE(x, v)        ((x)->quantization_mode = (v))
+
+typedef struct
+{
+  uint8_t  strideX;                // Convolution StrideX
+  uint8_t  strideY;                // Convolution StrideY
+  uint8_t  accumShift;             // Accumulator Shift - Shift to convert accumulator data to 16 bit
+  uint16_t outputScale;            // Amount by which shifted data is scaled
+  uint8_t  outputShift;            // Shift amount to convert the scaled data to 16 bit
+  uint8_t  flags;                  // Bit field laid out as in the table below; bits match the CNN_CONV_FLAG_* masks
+  /*
+   *  --------------------------------------------------------------------------
+   *  |bit 7 - 5|    bit 4     | bit 3       | bit2      | bit1       | bit0     |
+   *  | unused  |FC output flag|FC input flag|topEdgeFlag|leftEdgeFlag|Relu Flag |
+   *  --------------------------------------------------------------------------
+   */
+  uint8_t dilationX;   // dilation along kernel width
+  uint8_t dilationY;   // dilation along kernel height
+  int32_t reluMin;     // Minimum clamping limit when bit 0 of flags is set
+  int32_t reluMax;     // Maximum clamping limit when bit 0 of flags is set
+  int8_t  quantization_mode;  // NOTE(review): presumably one of the *_QUANTIZATION_MODE constants defined near the top of this header -- confirm
+  int32_t input_offset;       // NOTE(review): the three *_offset fields are presumably quantization zero-point offsets -- confirm
+  int32_t output_offset;
+  int32_t coeff_offset;
+  int32_t outputScaleTFL;     // NOTE(review): presumably the scale/shift pair used in the TFL quantization modes instead of outputScale/outputShift -- confirm
+  int32_t outputShiftTFL;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+  xb_f16  reluMinFlt;  // half-precision clamp limits; present only when the condition above holds
+  xb_f16  reluMaxFlt;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+  float reluMinFlt32;  // single-precision clamp limits; present only when the condition above holds
+  float reluMaxFlt32;
+#endif  // the conditional fields make sizeof(xai_cnn_conv_params) depend on the build flags
+} xai_cnn_conv_params;
+
+#define XAI_CNN_CONV_GET_STRIDE(x)               ((x)->strideX)
+#define XAI_CNN_CONV_SET_STRIDE(x, v)            ((x)->strideX = (v), (x)->strideY = (v));  // one expression, so an unbraced if guards both stores; v is evaluated twice; trailing ';' kept for callers that omit their own
+#define XAI_CNN_CONV_GET_STRIDEX(x)              ((x)->strideX)
+#define XAI_CNN_CONV_GET_STRIDEY(x)              ((x)->strideY)
+#define XAI_CNN_CONV_SET_STRIDE_XY(x, v1, v2)    ((x)->strideX = (v1), (x)->strideY = (v2));  // one expression, so an unbraced if guards both stores; trailing ';' kept for existing callers
+#define XAI_CNN_CONV_SET_STRIDEX(x, v)           (x)->strideX = (v);
+#define XAI_CNN_CONV_SET_STRIDEY(x, v)           (x)->strideY = (v);
+#define XAI_CNN_CONV_GET_ACCUM_SHIFT(x)          ((x)->accumShift)
+#define XAI_CNN_CONV_SET_ACCUM_SHIFT(x, v)       ((x)->accumShift = (v))
+#define XAI_CNN_CONV_GET_OUTPUT_SCALE(x)         ((x)->outputScale)
+#define XAI_CNN_CONV_SET_OUTPUT_SCALE(x, v)      ((x)->outputScale = (v))
+#define XAI_CNN_CONV_GET_OUTPUT_SHIFT(x)         ((x)->outputShift)
+#define XAI_CNN_CONV_SET_OUTPUT_SHIFT(x, v)      ((x)->outputShift = (v))
+#define XAI_CNN_CONV_GET_FLAGS(x)                ((x)->flags)
+#define XAI_CNN_CONV_SET_FLAGS(x, v)             ((x)->flags = (v))
+#define XAI_CNN_CONV_GET_FLAG_RELU(x)            ((x)->flags & CNN_CONV_FLAG_RELU)
+#define XAI_CNN_CONV_SET_FLAG_RELU(x)            ((x)->flags = ((x)->flags | CNN_CONV_FLAG_RELU))
+#define XAI_CNN_CONV_RESET_FLAG_RELU(x)          ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_RELU))
+#define XAI_CNN_CONV_GET_FLAG_LEFTEDGE(x)        ((x)->flags & CNN_CONV_FLAG_LEFTEDGE)
+#define XAI_CNN_CONV_SET_FLAG_LEFTEDGE(x)        ((x)->flags = ((x)->flags | CNN_CONV_FLAG_LEFTEDGE))
+#define XAI_CNN_CONV_RESET_FLAG_LEFTEDGE(x)      ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_LEFTEDGE))
+#define XAI_CNN_CONV_GET_FLAG_TOPEDGE(x)         ((x)->flags & CNN_CONV_FLAG_TOPEDGE)
+#define XAI_CNN_CONV_SET_FLAG_TOPEDGE(x)         ((x)->flags = ((x)->flags | CNN_CONV_FLAG_TOPEDGE))
+#define XAI_CNN_CONV_RESET_FLAG_TOPEDGE(x)       ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_TOPEDGE))
+#define XAI_CNN_CONV_GET_FLAG_INPUT(x)           ((x)->flags & CNN_CONV_FLAG_INPUT)
+#define XAI_CNN_CONV_SET_FLAG_INPUT(x)           ((x)->flags = ((x)->flags | CNN_CONV_FLAG_INPUT))
+#define XAI_CNN_CONV_RESET_FLAG_INPUT(x)         ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_INPUT))
+#define XAI_CNN_CONV_GET_FLAG_OUTPUT(x)          ((x)->flags & CNN_CONV_FLAG_OUTPUT)
+#define XAI_CNN_CONV_SET_FLAG_OUTPUT(x)          ((x)->flags = ((x)->flags | CNN_CONV_FLAG_OUTPUT))
+#define XAI_CNN_CONV_RESET_FLAG_OUTPUT(x)        ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_OUTPUT))
+#define XAI_CNN_CONV_GET_DILATION(x)             ((x)->dilationX)
+#define XAI_CNN_CONV_SET_DILATION(x, v)          ((x)->dilationX = (v), (x)->dilationY = (v));  // one expression, so an unbraced if guards both stores; v is evaluated twice; trailing ';' kept for existing callers
+#define XAI_CNN_CONV_GET_DILATIONX(x)            ((x)->dilationX)
+#define XAI_CNN_CONV_SET_DILATIONX(x, v)         ((x)->dilationX = (v))
+#define XAI_CNN_CONV_GET_DILATIONY(x)            ((x)->dilationY)
+#define XAI_CNN_CONV_SET_DILATIONY(x, v)         ((x)->dilationY = (v))
+#define XAI_CNN_CONV_SET_DILATION_XY(x, v1, v2)  ((x)->dilationX = (v1), (x)->dilationY = (v2));  // one expression, so an unbraced if guards both stores; trailing ';' kept for existing callers
+#define XAI_CNN_CONV_GET_RELU_MIN(x)             ((x)->reluMin)
+#define XAI_CNN_CONV_SET_RELU_MIN(x, v)          ((x)->reluMin = (v))
+#define XAI_CNN_CONV_GET_RELU_MAX(x)             ((x)->reluMax)
+#define XAI_CNN_CONV_SET_RELU_MAX(x, v)          ((x)->reluMax = (v))
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+#define XAI_CNN_CONV_GET_RELU_MIN_FLT(x)         ((x)->reluMinFlt)
+#define XAI_CNN_CONV_SET_RELU_MIN_FLT(x, v)      ((x)->reluMinFlt = (v))
+#define XAI_CNN_CONV_GET_RELU_MAX_FLT(x)         ((x)->reluMaxFlt)
+#define XAI_CNN_CONV_SET_RELU_MAX_FLT(x, v)      ((x)->reluMaxFlt = (v))
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+#define XAI_CNN_CONV_GET_RELU_MIN_FLT32(x)        ((x)->reluMinFlt32)
+#define XAI_CNN_CONV_SET_RELU_MIN_FLT32(x, v)     ((x)->reluMinFlt32 = (v))
+#define XAI_CNN_CONV_GET_RELU_MAX_FLT32(x)        ((x)->reluMaxFlt32)
+#define XAI_CNN_CONV_SET_RELU_MAX_FLT32(x, v)     ((x)->reluMaxFlt32 = (v))
+#endif
+#define XAI_CNN_CONV_GET_QUANTIZATION_MODE(x)     ((x)->quantization_mode)
+#define XAI_CNN_CONV_SET_QUANTIZATION_MODE(x, v)  ((x)->quantization_mode = (v))
+#define XAI_CNN_CONV_GET_INPUT_OFFSET(x)          ((x)->input_offset)
+#define XAI_CNN_CONV_SET_INPUT_OFFSET(x, v)       ((x)->input_offset = (v))
+#define XAI_CNN_CONV_GET_OUTPUT_OFFSET(x)         ((x)->output_offset)
+#define XAI_CNN_CONV_SET_OUTPUT_OFFSET(x, v)      ((x)->output_offset = (v))
+#define XAI_CNN_CONV_GET_COEFF_OFFSET(x)          ((x)->coeff_offset)
+#define XAI_CNN_CONV_SET_COEFF_OFFSET(x, v)       ((x)->coeff_offset = (v))
+#define XAI_CNN_CONV_GET_OUTPUT_SCALE_TFL(x)      ((x)->outputScaleTFL)
+#define XAI_CNN_CONV_SET_OUTPUT_SCALE_TFL(x, v)   ((x)->outputScaleTFL = (v))
+#define XAI_CNN_CONV_GET_OUTPUT_SHIFT_TFL(x)      ((x)->outputShiftTFL)
+#define XAI_CNN_CONV_SET_OUTPUT_SHIFT_TFL(x, v)   ((x)->outputShiftTFL = (v))
+
+typedef struct
+{
+  uint8_t  strideX;                // Convolution StrideX
+  uint8_t  strideY;                // Convolution StrideY
+  uint8_t  accumShift;             // Accumulator Shift - Shift to convert accumulator data to 16 bit
+  uint16_t outputScale;            // Amount by which shifted data is scaled
+  uint8_t  outputShift;            // Shift amount to convert the scaled data to 16 bit
+  uint8_t  flags;                  // Bit field laid out as in the table below; accessed with the CNN_CONV_FLAG_* masks
+  /*
+   *  --------------------------------------------------------------------------
+   *  |bit 7 - 5|    bit 4     | bit 3       | bit2      | bit1       | bit0     |
+   *  | unused  |FC output flag|FC input flag|topEdgeFlag|leftEdgeFlag|Relu Flag |
+   *  --------------------------------------------------------------------------
+   */
+  uint8_t dilationX;              // dilation along kernel width
+  uint8_t dilationY;              // dilation along kernel height
+  uint8_t depthMultiplier;        // factor by which output depth size varies from input depth size
+  int32_t reluMin;                // Minimum clamping limit when bit 0 of flags is set
+  int32_t reluMax;                // Maximum clamping limit when bit 0 of flags is set
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+  xb_f16  reluMinFlt;             // half-precision clamp limits; present only when the condition above holds
+  xb_f16  reluMaxFlt;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+  float   reluMinFlt32;           // single-precision clamp limits; present only when the condition above holds
+  float   reluMaxFlt32;
+#endif  // the conditional fields sit mid-struct, so the offsets of the fields below depend on the build flags
+  int8_t  quantization_mode;      // NOTE(review): presumably one of the *_QUANTIZATION_MODE constants defined near the top of this header -- confirm
+  int32_t input_offset;           // NOTE(review): presumably quantization zero-point offsets -- confirm
+  int32_t output_offset;
+} xai_cnn_depthwiseDilatedConv_params;
+
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(x)               ((x)->strideX)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_STRIDE(x, v)            ((x)->strideX = (v), (x)->strideY = (v))  // one expression, so an unbraced if guards both stores; v is evaluated twice
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(x)              ((x)->strideX)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(x)              ((x)->strideY)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_STRIDE_XY(x, v1, v2)    ((x)->strideX = (v1), (x)->strideY = (v2))  // one expression, so an unbraced if guards both stores
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_STRIDEX(x, v)           (x)->strideX = (v);
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_STRIDEY(x, v)           (x)->strideY = (v);
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_ACCUM_SHIFT(x)          ((x)->accumShift)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_ACCUM_SHIFT(x, v)       ((x)->accumShift = (v))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_OUTPUT_SCALE(x)         ((x)->outputScale)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_OUTPUT_SCALE(x, v)      ((x)->outputScale = (v))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_OUTPUT_SHIFT(x)         ((x)->outputShift)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_OUTPUT_SHIFT(x, v)      ((x)->outputShift = (v))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAGS(x)                ((x)->flags)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_FLAGS(x, v)             ((x)->flags = (v))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_RELU(x)            ((x)->flags & CNN_CONV_FLAG_RELU)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_FLAG_RELU(x)            ((x)->flags = ((x)->flags | CNN_CONV_FLAG_RELU))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_RESET_FLAG_RELU(x)          ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_RELU))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_LEFTEDGE(x)        ((x)->flags & CNN_CONV_FLAG_LEFTEDGE)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_FLAG_LEFTEDGE(x)        ((x)->flags = ((x)->flags | CNN_CONV_FLAG_LEFTEDGE))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_RESET_FLAG_LEFTEDGE(x)      ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_LEFTEDGE))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_TOPEDGE(x)         ((x)->flags & CNN_CONV_FLAG_TOPEDGE)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_FLAG_TOPEDGE(x)         ((x)->flags = ((x)->flags | CNN_CONV_FLAG_TOPEDGE))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_RESET_FLAG_TOPEDGE(x)       ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_TOPEDGE))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_INPUT(x)           ((x)->flags & CNN_CONV_FLAG_INPUT)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_FLAG_INPUT(x)           ((x)->flags = ((x)->flags | CNN_CONV_FLAG_INPUT))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_RESET_FLAG_INPUT(x)         ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_INPUT))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_OUTPUT(x)          ((x)->flags & CNN_CONV_FLAG_OUTPUT)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_FLAG_OUTPUT(x)          ((x)->flags = ((x)->flags | CNN_CONV_FLAG_OUTPUT))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_RESET_FLAG_OUTPUT(x)        ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_OUTPUT))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(x)             ((x)->dilationX)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_DILATION(x, v)          ((x)->dilationX = (v), (x)->dilationY = (v))  // one expression, so an unbraced if guards both stores; v is evaluated twice
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(x)            ((x)->dilationX)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_DILATIONX(x, v)         ((x)->dilationX = (v))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(x)            ((x)->dilationY)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_DILATIONY(x, v)         ((x)->dilationY = (v))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_DILATION_XY(x, v1, v2)  ((x)->dilationX = (v1), (x)->dilationY = (v2))  // one expression, so an unbraced if guards both stores
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(x)     ((x)->depthMultiplier)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_DEPTH_MULTIPLIER(x, v)  ((x)->depthMultiplier = (v))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(x)             ((x)->reluMin)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_RELU_MIN(x, v)          ((x)->reluMin = (v))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(x)             ((x)->reluMax)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_RELU_MAX(x, v)          ((x)->reluMax = (v))
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN_FLT(x)         ((x)->reluMinFlt)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_RELU_MIN_FLT(x, v)      ((x)->reluMinFlt = (v))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX_FLT(x)         ((x)->reluMaxFlt)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_RELU_MAX_FLT(x, v)      ((x)->reluMaxFlt = (v))
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN_FLT32(x)        ((x)->reluMinFlt32)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_RELU_MIN_FLT32(x, v)     ((x)->reluMinFlt32 = (v))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX_FLT32(x)        ((x)->reluMaxFlt32)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_RELU_MAX_FLT32(x, v)     ((x)->reluMaxFlt32 = (v))
+#endif
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_QUANTIZATION_MODE(x)     ((x)->quantization_mode)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_QUANTIZATION_MODE(x, v)  ((x)->quantization_mode = (v))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_INPUT_OFFSET(x)          ((x)->input_offset)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_INPUT_OFFSET(x, v)       ((x)->input_offset = (v))
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_OUTPUT_OFFSET(x)         ((x)->output_offset)
+#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_OUTPUT_OFFSET(x, v)      ((x)->output_offset = (v))
+
+typedef struct
+{
+  uint8_t kernelWidth;             // Normalization window width
+  uint8_t kernelHeight;            // Normalization window height
+  int16_t sigmaScale;              // Factor used to scale the sum of squares of data under the normalization window
+  uint8_t sigmaScaleShift;         // Shift to map the scaled sum of squares to LUT index
+  uint8_t outputShift;             // Output shift
+} xai_cnn_lrn_spatial_params;
+// Depth variant: same fields as the spatial struct, with kernelDepth replacing kernelWidth/kernelHeight.
+typedef struct
+{
+  uint8_t kernelDepth;             // Normalization window depth
+  int16_t sigmaScale;              // Factor used to scale the sum of squares of data under the normalization window
+  uint8_t sigmaScaleShift;         // Shift to map the scaled sum of squares to LUT index
+  uint8_t outputShift;             // Output shift
+} xai_cnn_lrn_depth_params;
+// One accessor set serves both LRN structs: KERNELWIDTH/KERNELHEIGHT only compile for the spatial struct, KERNELDEPTH only for the depth struct.
+#define XAI_CNN_LRN_GET_KERNELWIDTH(x)         ((x)->kernelWidth)
+#define XAI_CNN_LRN_SET_KERNELWIDTH(x, v)      ((x)->kernelWidth = (v))
+#define XAI_CNN_LRN_GET_KERNELHEIGHT(x)        ((x)->kernelHeight)
+#define XAI_CNN_LRN_SET_KERNELHEIGHT(x, v)     ((x)->kernelHeight = (v))
+#define XAI_CNN_LRN_GET_KERNELDEPTH(x)         ((x)->kernelDepth)
+#define XAI_CNN_LRN_SET_KERNELDEPTH(x, v)      ((x)->kernelDepth = (v))
+#define XAI_CNN_LRN_GET_SIGMASCALE(x)          ((x)->sigmaScale)
+#define XAI_CNN_LRN_SET_SIGMASCALE(x, v)       ((x)->sigmaScale = (v))
+#define XAI_CNN_LRN_GET_SIGMASCALESHIFT(x)     ((x)->sigmaScaleShift)
+#define XAI_CNN_LRN_SET_SIGMASCALESHIFT(x, v)  ((x)->sigmaScaleShift = (v))
+#define XAI_CNN_LRN_GET_OUTPUTSHIFT(x)         ((x)->outputShift)
+#define XAI_CNN_LRN_SET_OUTPUTSHIFT(x, v)      ((x)->outputShift = (v))
+
+typedef struct
+{
+  int16_t kernelWidth;   // Pooling window width
+  int16_t kernelHeight;  // Pooling window height
+  uint8_t strideX;     // The number of points by which the pooling window
+                       // is shifted along X direction.
+  uint8_t strideY;     // The number of points by which the pooling window
+                       // is shifted along Y direction.
+  uint8_t edgeFlag;    // edgeFlag is applicable only for pooling with even kernel sizes. Least significant bit(LSB)
+                       // of the flag represents whether minimum left edge size required for pooling should be
+                       // greater than the minimum right edge size required. The bit adjacent to LSB decides whether
+                       // minimum top edge size required should be greater than minimum bottom edge size.
+  int16_t outputScale; // Normalizer value to be multiplied with sum of elements under the pooling window
+  uint8_t outputShift; // Shift to be applied on the normalized sum to obtain the average
+  int32_t fixUpInit;   // The fix-up term that is used to incorporate zero points
+  uint8_t enableRelu;  // Accessed via XAI_CNN_POOLING_GET_RELUFLAG / XAI_CNN_POOLING_SET_RELUFLAG
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+  xb_f16  reluMinFlt;  // half-precision clamp limits; present only when the condition above holds
+  xb_f16  reluMaxFlt;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+  float   reluMinFlt32;  // single-precision clamp limits; present only when the condition above holds
+  float   reluMaxFlt32;
+#endif  // the conditional fields sit mid-struct, so the offsets of the fields below depend on the build flags
+  int8_t  quantization_mode;  // NOTE(review): presumably one of the *_QUANTIZATION_MODE constants defined near the top of this header -- confirm
+  int32_t reluMin;  // NOTE(review): presumably integer clamp limits applied when enableRelu is set -- confirm
+  int32_t reluMax;
+} xai_cnn_pooling_params;
+
+#define XAI_CNN_POOLING_GET_KERNELWIDTH(x)        ((x)->kernelWidth)
+#define XAI_CNN_POOLING_SET_KERNELWIDTH(x, v)     ((x)->kernelWidth = (v))
+#define XAI_CNN_POOLING_GET_KERNELHEIGHT(x)       ((x)->kernelHeight)
+#define XAI_CNN_POOLING_SET_KERNELHEIGHT(x, v)    ((x)->kernelHeight = (v))
+#define XAI_CNN_POOLING_GET_STRIDE(x)             ((x)->strideX)
+#define XAI_CNN_POOLING_SET_STRIDE(x, v)          ((x)->strideX = (v), (x)->strideY = (v));  // one expression, so an unbraced if guards both stores; v is evaluated twice; trailing ';' kept for callers that omit their own
+#define XAI_CNN_POOLING_GET_STRIDEX(x)            ((x)->strideX)
+#define XAI_CNN_POOLING_GET_STRIDEY(x)            ((x)->strideY)
+#define XAI_CNN_POOLING_SET_STRIDE_XY(x, v1, v2)  ((x)->strideX = (v1), (x)->strideY = (v2));  // one expression, so an unbraced if guards both stores; trailing ';' kept for existing callers
+#define XAI_CNN_POOLING_SET_STRIDEX(x, v)         (x)->strideX = (v);
+#define XAI_CNN_POOLING_SET_STRIDEY(x, v)         (x)->strideY = (v);
+#define XAI_CNN_POOLING_GET_TOPEDGE_FLAG(x)       ((x)->edgeFlag & CNN_POOLING_TOPEDGE_FLAG)
+#define XAI_CNN_POOLING_SET_TOPEDGE_FLAG(x)       ((x)->edgeFlag = ((x)->edgeFlag | CNN_POOLING_TOPEDGE_FLAG))
+#define XAI_CNN_POOLING_RESET_TOPEDGE_FLAG(x)     ((x)->edgeFlag = ((x)->edgeFlag & ~CNN_POOLING_TOPEDGE_FLAG))
+#define XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(x)      ((x)->edgeFlag & CNN_POOLING_LEFTEDGE_FLAG)
+#define XAI_CNN_POOLING_SET_LEFTEDGE_FLAG(x)      ((x)->edgeFlag = ((x)->edgeFlag | CNN_POOLING_LEFTEDGE_FLAG))
+#define XAI_CNN_POOLING_RESET_LEFTEDGE_FLAG(x)    ((x)->edgeFlag = ((x)->edgeFlag & ~CNN_POOLING_LEFTEDGE_FLAG))
+#define XAI_CNN_POOLING_GET_OUTPUTSCALE(x)        ((x)->outputScale)
+#define XAI_CNN_POOLING_SET_OUTPUTSCALE(x, v)     ((x)->outputScale = (v))
+#define XAI_CNN_POOLING_GET_OUTPUTSHIFT(x)        ((x)->outputShift)
+#define XAI_CNN_POOLING_SET_OUTPUTSHIFT(x, v)     ((x)->outputShift = (v))
+#define XAI_CNN_POOLING_GET_FIXUPINIT(x)          ((x)->fixUpInit)
+#define XAI_CNN_POOLING_SET_FIXUPINIT(x, v)       ((x)->fixUpInit = (v))
+#define XAI_CNN_POOLING_GET_RELUFLAG(x)           ((x)->enableRelu)
+#define XAI_CNN_POOLING_SET_RELUFLAG(x, v)        ((x)->enableRelu = (v))
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+#define XAI_CNN_POOLING_GET_RELUMINFLT(x)     ((x)->reluMinFlt)
+#define XAI_CNN_POOLING_SET_RELUMINFLT(x, v)  ((x)->reluMinFlt = (v))
+#define XAI_CNN_POOLING_GET_RELUMAXFLT(x)     ((x)->reluMaxFlt)
+#define XAI_CNN_POOLING_SET_RELUMAXFLT(x, v)  ((x)->reluMaxFlt = (v))
+#endif
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+#define XAI_CNN_POOLING_GET_RELU_MIN_FLT32(x)        ((x)->reluMinFlt32)
+#define XAI_CNN_POOLING_SET_RELU_MIN_FLT32(x, v)     ((x)->reluMinFlt32 = (v))
+#define XAI_CNN_POOLING_GET_RELU_MAX_FLT32(x)        ((x)->reluMaxFlt32)
+#define XAI_CNN_POOLING_SET_RELU_MAX_FLT32(x, v)     ((x)->reluMaxFlt32 = (v))
+#endif
+#define XAI_CNN_POOLING_GET_QUANTIZATION_MODE(x)     ((x)->quantization_mode)
+#define XAI_CNN_POOLING_SET_QUANTIZATION_MODE(x, v)  ((x)->quantization_mode = (v))
+#define XAI_CNN_POOLING_GET_RELUMIN(x)               ((x)->reluMin)
+#define XAI_CNN_POOLING_SET_RELUMIN(x, v)            ((x)->reluMin = (v))
+#define XAI_CNN_POOLING_GET_RELUMAX(x)               ((x)->reluMax)
+#define XAI_CNN_POOLING_SET_RELUMAX(x, v)            ((x)->reluMax = (v))
+
+typedef struct
+{
+  int16_t outputScale; // Normalizer value to be multiplied with sum of elements under the pooling window
+  uint8_t tileFlag;    // Indicates whether the given tile is a first tile, last tile or neither of those (NOTE(review): presumably a CNN_GLOBAL_POOL_*_TILE value -- confirm)
+  uint8_t outputShift; // Shift to be applied on the normalized sum to obtain the average
+  uint8_t accShift;    // Accumulator shift that is applied to bring the data to S32 range
+  int32_t fixUpInit;   // The fix-up term that is used to incorporate zero points
+} xai_cnn_global_pooling_params;
+// Accessors for xai_cnn_global_pooling_params; x is a pointer to the struct.
+#define XAI_CNN_GLOBAL_POOLING_GET_OUTPUTSCALE(x)     ((x)->outputScale)
+#define XAI_CNN_GLOBAL_POOLING_SET_OUTPUTSCALE(x, v)  ((x)->outputScale = (v))
+#define XAI_CNN_GLOBAL_POOLING_GET_OUTPUTSHIFT(x)     ((x)->outputShift)
+#define XAI_CNN_GLOBAL_POOLING_SET_OUTPUTSHIFT(x, v)  ((x)->outputShift = (v))
+#define XAI_CNN_GLOBAL_POOLING_GET_ACCSHIFT(x)        ((x)->accShift)
+#define XAI_CNN_GLOBAL_POOLING_SET_ACCSHIFT(x, v)     ((x)->accShift = (v))
+#define XAI_CNN_GLOBAL_POOLING_GET_TILE_FLAG(x)       ((x)->tileFlag)
+#define XAI_CNN_GLOBAL_POOLING_SET_TILE_FLAG(x, v)    ((x)->tileFlag = (v))
+#define XAI_CNN_GLOBAL_POOLING_GET_FIXUPINIT(x)       ((x)->fixUpInit)
+#define XAI_CNN_GLOBAL_POOLING_SET_FIXUPINIT(x, v)    ((x)->fixUpInit = (v))
+
+typedef struct
+{
+  uint16_t spatialScaleX;           // Multiplicative spatial scale factor to translate ROI coords from their
+                                    // input scale to the scale used when pooling.
+                                    // This field is the spatial scale in the X direction.
+  uint16_t spatialScaleY;           // Spatial scale in the Y direction
+  uint16_t spatialScaleShiftX;      // Shift value to apply for spatial scale operations in the X direction
+  uint16_t spatialScaleShiftY;      // Shift value to apply for spatial scale operations in the Y direction
+  int32_t  pooledHeight;            // Total number of fixed output points along height dimension from ROI
+  int32_t  pooledWidth;             // Total number of fixed output points along width dimension from ROI
+  uint16_t oneByPooledHeightScale;  // Reciprocal of pooledHeight represented in U15 range
+  uint16_t oneByPooledWidthScale;   // Reciprocal of pooledWidth represented in U15 range
+  uint16_t oneByPooledHeightShift;  // Shift value to normalize after operating with oneByPooledHeightScale variable
+  uint16_t oneByPooledWidthShift;   // Shift value to normalize after operating with oneByPooledWidthScale variable
+} xai_cnn_roi_pooling_params;
+
+#define XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALEX(x)                 ((x)->spatialScaleX)
+#define XAI_CNN_ROI_POOLING_SET_SPATIAL_SCALEX(x, v)              ((x)->spatialScaleX = (v))
+#define XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALEY(x)                 ((x)->spatialScaleY)
+#define XAI_CNN_ROI_POOLING_SET_SPATIAL_SCALEY(x, v)              ((x)->spatialScaleY = (v))
+#define XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALE_SHIFTX(x)           ((x)->spatialScaleShiftX)
+#define XAI_CNN_ROI_POOLING_SET_SPATIAL_SCALE_SHIFTX(x, v)        ((x)->spatialScaleShiftX = (v))
+#define XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALE_SHIFTY(x)           ((x)->spatialScaleShiftY)
+#define XAI_CNN_ROI_POOLING_SET_SPATIAL_SCALE_SHIFTY(x, v)        ((x)->spatialScaleShiftY = (v))
+#define XAI_CNN_ROI_POOLING_GET_POOLED_WIDTH(x)                   ((x)->pooledWidth)
+#define XAI_CNN_ROI_POOLING_SET_POOLED_WIDTH(x, v)                ((x)->pooledWidth = (v))
+#define XAI_CNN_ROI_POOLING_GET_POOLED_HEIGHT(x)                  ((x)->pooledHeight)
+#define XAI_CNN_ROI_POOLING_SET_POOLED_HEIGHT(x, v)               ((x)->pooledHeight = (v))
+#define XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_WIDTH_SCALE(x)      ((x)->oneByPooledWidthScale)
+#define XAI_CNN_ROI_POOLING_SET_ONE_BY_POOLED_WIDTH_SCALE(x, v)   ((x)->oneByPooledWidthScale = (v))
+#define XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_HEIGHT_SCALE(x)     ((x)->oneByPooledHeightScale)
+#define XAI_CNN_ROI_POOLING_SET_ONE_BY_POOLED_HEIGHT_SCALE(x, v)  ((x)->oneByPooledHeightScale = (v))
+#define XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_WIDTH_SHIFT(x)      ((x)->oneByPooledWidthShift)
+#define XAI_CNN_ROI_POOLING_SET_ONE_BY_POOLED_WIDTH_SHIFT(x, v)   ((x)->oneByPooledWidthShift = (v))
+#define XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_HEIGHT_SHIFT(x)     ((x)->oneByPooledHeightShift)
+#define XAI_CNN_ROI_POOLING_SET_ONE_BY_POOLED_HEIGHT_SHIFT(x, v)  ((x)->oneByPooledHeightShift = (v))
+
+typedef struct
+{
+  uint8_t outputShift;      /* No. of output bits to be right shifted. */
+  uint8_t qFactorOutput;    /* No. of bits scaling applied to the reciprocal of the sum of exp(x)*/
+  int16_t maxVal;           /* global max value in the 3D tile */
+  int8_t  axis;             /* dimension along which softmax is applied*/
+  int8_t  quantization_mode;
+  int32_t diff_min;          //defines minimum difference with respect to the maximum value
+  int32_t inputScale;        //significand of BetaScaleQ5.26
+  int32_t inputShift;        //exponent of BetaScaleQ5.26
+} xai_cnn_softmax_params;
+
+#define XAI_CNN_SOFTMAX_GET_OUTPUTSHIFT(x)           ((x)->outputShift)
+#define XAI_CNN_SOFTMAX_SET_OUTPUTSHIFT(x, v)        ((x)->outputShift = (v))
+#define XAI_CNN_SOFTMAX_GET_QFACTOROUTPUT(x)         ((x)->qFactorOutput)
+#define XAI_CNN_SOFTMAX_SET_QFACTOROUTPUT(x, v)      ((x)->qFactorOutput = (v))
+#define XAI_CNN_SOFTMAX_GET_MAXVAL(x)                ((x)->maxVal)
+#define XAI_CNN_SOFTMAX_SET_MAXVAL(x, v)             ((x)->maxVal = (v))
+#define XAI_CNN_SOFTMAX_GET_AXIS(x)                  ((x)->axis)
+#define XAI_CNN_SOFTMAX_SET_AXIS(x, v)               ((x)->axis = (v))
+#define XAI_CNN_SOFTMAX_GET_QUANTIZATION_MODE(x)     ((x)->quantization_mode)
+#define XAI_CNN_SOFTMAX_SET_QUANTIZATION_MODE(x, v)  ((x)->quantization_mode = (v))
+#define XAI_CNN_SOFTMAX_PARAMS_GET_DIFF_MIN(x)       ((x)->diff_min)
+#define XAI_CNN_SOFTMAX_PARAMS_SET_DIFF_MIN(x, v)    ((x)->diff_min = (v))
+#define XAI_CNN_SOFTMAX_GET_INPUT_SCALE(x)           ((x)->inputScale)
+#define XAI_CNN_SOFTMAX_SET_INPUT_SCALE(x, v)        ((x)->inputScale = (v))
+#define XAI_CNN_SOFTMAX_GET_INPUT_SHIFT(x)           ((x)->inputShift)
+#define XAI_CNN_SOFTMAX_SET_INPUT_SHIFT(x, v)        ((x)->inputShift = (v))
+
+typedef struct
+{
+  int8_t  quantization_mode;
+  // tfl related parameters
+  int32_t inputZeroPoint;
+  int32_t outputZeroPoint;
+  int16_t reluishMultiplierFixedpointS16;
+  int32_t reluishMultiplierExponent;
+  int16_t outputMultiplierFixedpointS16;
+  int32_t outputMultiplierExponent;
+} xai_cnn_tfl_hardSwish_params;
+
+#define XAI_CNN_HARDSWISH_GET_QUANTIZATION_MODE(x)                      ((x)->quantization_mode)
+#define XAI_CNN_HARDSWISH_SET_QUANTIZATION_MODE(x, v)                   ((x)->quantization_mode = (v))
+#define XAI_CNN_HARDSWISH_GET_INPUT_ZERO_POINT(x)                       ((x)->inputZeroPoint)
+#define XAI_CNN_HARDSWISH_SET_INPUT_ZERO_POINT(x, v)                    ((x)->inputZeroPoint = (v))
+#define XAI_CNN_HARDSWISH_GET_OUTPUT_ZERO_POINT(x)                      ((x)->outputZeroPoint)
+#define XAI_CNN_HARDSWISH_SET_OUTPUT_ZERO_POINT(x, v)                   ((x)->outputZeroPoint = (v))
+#define XAI_CNN_HARDSWISH_GET_RELUISH_MULTIPLIER_FIXED_POINT_S16(x)     ((x)->reluishMultiplierFixedpointS16)
+#define XAI_CNN_HARDSWISH_SET_RELUISH_MULTIPLIER_FIXED_POINT_S16(x, v)  ((x)->reluishMultiplierFixedpointS16 = (v))
+#define XAI_CNN_HARDSWISH_GET_RELUISH_MULTIPLIER_EXPONENT(x)            ((x)->reluishMultiplierExponent)
+#define XAI_CNN_HARDSWISH_SET_RELUISH_MULTIPLIER_EXPONENT(x, v)         ((x)->reluishMultiplierExponent = (v))
+#define XAI_CNN_HARDSWISH_GET_OUTPUT_MULTIPLIER_FIXED_POINT_S16(x)      ((x)->outputMultiplierFixedpointS16)
+#define XAI_CNN_HARDSWISH_SET_OUTOUT_MULTIPLIER_FIXED_POINT_S16(x, v)   ((x)->outputMultiplierFixedpointS16 = (v)) /* NOTE(review): "OUTOUT" is a misspelling of "OUTPUT" (compare the matching GET macro); name left unchanged to avoid breaking any existing users */
+#define XAI_CNN_HARDSWISH_GET_OUTPUT_MULTIPLIER_EXPONENT(x)             ((x)->outputMultiplierExponent)
+#define XAI_CNN_HARDSWISH_SET_OUTPUT_MULTIPLIER_EXPONENT(x, v)          ((x)->outputMultiplierExponent = (v))
+
+typedef struct
+{
+  int8_t  quantization_mode;
+  // tfl related parameters
+  int32_t inputRangeRadius;
+  int32_t inputScale;
+  int32_t inputShift;
+  int32_t inputZeroPoint;
+} xai_cnn_sigmoid_params;
+
+#define XAI_CNN_SIGMOID_GET_QUANTIZATION_MODE(x)      ((x)->quantization_mode)
+#define XAI_CNN_SIGMOID_SET_QUANTIZATION_MODE(x, v)   ((x)->quantization_mode = (v))
+#define XAI_CNN_SIGMOID_GET_INPUT_RANGE_RADIUS(x)     ((x)->inputRangeRadius)
+#define XAI_CNN_SIGMOID_SET_INPUT_RANGE_RADIUS(x, v)  ((x)->inputRangeRadius = (v))
+#define XAI_CNN_SIGMOID_GET_INPUT_SCALE(x)            ((x)->inputScale)
+#define XAI_CNN_SIGMOID_SET_INPUT_SCALE(x, v)         ((x)->inputScale = (v))
+#define XAI_CNN_SIGMOID_GET_INPUT_SHIFT(x)            ((x)->inputShift)
+#define XAI_CNN_SIGMOID_SET_INPUT_SHIFT(x, v)         ((x)->inputShift = (v))
+#define XAI_CNN_SIGMOID_GET_INPUT_ZERO_POINT(x)       ((x)->inputZeroPoint)
+#define XAI_CNN_SIGMOID_SET_INPUT_ZERO_POINT(x, v)    ((x)->inputZeroPoint = (v))
+
+typedef struct
+{
+  int8_t  quantization_mode;
+  // tfl related parameters
+  int32_t inputRangeRadius;
+  int32_t inputScale;
+  int32_t inputShift;
+  int32_t inputZeroPoint;
+  int32_t outputZeroPoint; //Hack in Glow to keep tanh and sigmoid params different. NOTE(review): the XAI_CNN_TANH_* accessor group that follows has no GET/SET macro for this field
+} xai_cnn_tanh_params;
+
+#define XAI_CNN_TANH_GET_QUANTIZATION_MODE(x)      ((x)->quantization_mode)
+#define XAI_CNN_TANH_SET_QUANTIZATION_MODE(x, v)   ((x)->quantization_mode = (v))
+#define XAI_CNN_TANH_GET_INPUT_RANGE_RADIUS(x)     ((x)->inputRangeRadius)
+#define XAI_CNN_TANH_SET_INPUT_RANGE_RADIUS(x, v)  ((x)->inputRangeRadius = (v))
+#define XAI_CNN_TANH_GET_INPUT_SCALE(x)            ((x)->inputScale)
+#define XAI_CNN_TANH_SET_INPUT_SCALE(x, v)         ((x)->inputScale = (v))
+#define XAI_CNN_TANH_GET_INPUT_SHIFT(x)            ((x)->inputShift)
+#define XAI_CNN_TANH_SET_INPUT_SHIFT(x, v)         ((x)->inputShift = (v))
+#define XAI_CNN_TANH_GET_INPUT_ZERO_POINT(x)       ((x)->inputZeroPoint)
+#define XAI_CNN_TANH_SET_INPUT_ZERO_POINT(x, v)    ((x)->inputZeroPoint = (v))
+
+
+typedef struct
+{
+  int32_t outputScaleIdentity;
+  int32_t outputShiftIdentity;
+  int32_t outputScaleAlpha;
+  int32_t outputShiftAlpha;
+  int32_t inputOffset;
+  int32_t outputOffset;
+  int8_t  quantization_mode;
+} xai_cnn_tfl_leakyrelu_params;
+
+#define XAI_CNN_LEAKYRELU_GET_OUTPUT_SCALE_IDENTITY(x)     ((x)->outputScaleIdentity)
+#define XAI_CNN_LEAKYRELU_SET_OUTPUT_SCALE_IDENTITY(x, v)  ((x)->outputScaleIdentity = (v))
+#define XAI_CNN_LEAKYRELU_GET_OUTPUT_SHIFT_IDENTITY(x)     ((x)->outputShiftIdentity)
+#define XAI_CNN_LEAKYRELU_SET_OUTPUT_SHIFT_IDENTITY(x, v)  ((x)->outputShiftIdentity = (v))
+#define XAI_CNN_LEAKYRELU_GET_OUTPUT_SCALE_ALPHA(x)        ((x)->outputScaleAlpha)
+#define XAI_CNN_LEAKYRELU_SET_OUTPUT_SCALE_ALPHA(x, v)     ((x)->outputScaleAlpha = (v))
+#define XAI_CNN_LEAKYRELU_GET_OUTPUT_SHIFT_ALPHA(x)        ((x)->outputShiftAlpha)
+#define XAI_CNN_LEAKYRELU_SET_OUTPUT_SHIFT_ALPHA(x, v)     ((x)->outputShiftAlpha = (v))
+#define XAI_CNN_LEAKYRELU_GET_INPUT_OFFSET(x)              ((x)->inputOffset)
+#define XAI_CNN_LEAKYRELU_SET_INPUT_OFFSET(x, v)           ((x)->inputOffset = (v))
+#define XAI_CNN_LEAKYRELU_GET_OUTPUT_OFFSET(x)             ((x)->outputOffset)
+#define XAI_CNN_LEAKYRELU_SET_OUTPUT_OFFSET(x, v)          ((x)->outputOffset = (v))
+#define XAI_CNN_LEAKYRELU_GET_QUANTIZATION_MODE(x)         ((x)->quantization_mode)
+#define XAI_CNN_LEAKYRELU_SET_QUANTIZATION_MODE(x, v)      ((x)->quantization_mode = (v))
+
+typedef struct
+{
+  int32_t outputScalePositive;
+  int32_t outputScaleNegative;
+  int32_t outputShiftPositive;
+  int32_t outputShiftNegative;
+  int32_t inputOffset;
+  int32_t outputOffset;
+  int32_t alphaOffset;
+  int8_t  quantization_mode;
+} xai_cnn_tfl_prelu_params;
+
+#define XAI_CNN_PRELU_GET_OUTPUT_SCALE_POSITIVE(x)     ((x)->outputScalePositive)
+#define XAI_CNN_PRELU_SET_OUTPUT_SCALE_POSITIVE(x, v)  ((x)->outputScalePositive = (v))
+#define XAI_CNN_PRELU_GET_OUTPUT_SHIFT_POSITIVE(x)     ((x)->outputShiftPositive)
+#define XAI_CNN_PRELU_SET_OUTPUT_SHIFT_POSITIVE(x, v)  ((x)->outputShiftPositive = (v))
+#define XAI_CNN_PRELU_GET_OUTPUT_SCALE_NEGATIVE(x)     ((x)->outputScaleNegative)
+#define XAI_CNN_PRELU_SET_OUTPUT_SCALE_NEGATIVE(x, v)  ((x)->outputScaleNegative = (v))
+#define XAI_CNN_PRELU_GET_OUTPUT_SHIFT_NEGATIVE(x)     ((x)->outputShiftNegative)
+#define XAI_CNN_PRELU_SET_OUTPUT_SHIFT_NEGATIVE(x, v)  ((x)->outputShiftNegative = (v))
+#define XAI_CNN_PRELU_GET_INPUT_OFFSET(x)              ((x)->inputOffset)
+#define XAI_CNN_PRELU_SET_INPUT_OFFSET(x, v)           ((x)->inputOffset = (v))
+#define XAI_CNN_PRELU_GET_OUTPUT_OFFSET(x)             ((x)->outputOffset)
+#define XAI_CNN_PRELU_SET_OUTPUT_OFFSET(x, v)          ((x)->outputOffset = (v))
+#define XAI_CNN_PRELU_GET_ALPHA_OFFSET(x)              ((x)->alphaOffset)
+#define XAI_CNN_PRELU_SET_ALPHA_OFFSET(x, v)           ((x)->alphaOffset = (v))
+#define XAI_CNN_PRELU_GET_QUANTIZATION_MODE(x)         ((x)->quantization_mode)
+#define XAI_CNN_PRELU_SET_QUANTIZATION_MODE(x, v)      ((x)->quantization_mode = (v))
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+typedef struct
+{
+  int32_t axis;                  // axis along which softmax is to be computed
+  xb_f16  beta;                  // multiplication factor
+} xai_cnn_softmaxA3D_F16_params;
+
+#define XAI_CNN_SOFTMAXAF16_PARAMS_GET_AXIS(x)     ((x)->axis)
+#define XAI_CNN_SOFTMAXAF16_PARAMS_GET_BETA(x)     ((x)->beta)
+#define XAI_CNN_SOFTMAXAF16_PARAMS_SET_AXIS(x, v)  ((x)->axis = (v))
+#define XAI_CNN_SOFTMAXAF16_PARAMS_SET_BETA(x, v)  ((x)->beta = (v))
+#endif // #if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+typedef struct
+{
+  int32_t axis;                 // axis along which softmax is to be computed
+  float   beta;                 // multiplication factor
+} xai_cnn_softmaxA3D_F32_params;
+
+#define XAI_CNN_SOFTMAXAF32_PARAMS_GET_AXIS(x)     ((x)->axis)
+#define XAI_CNN_SOFTMAXAF32_PARAMS_GET_BETA(x)     ((x)->beta)
+#define XAI_CNN_SOFTMAXAF32_PARAMS_SET_AXIS(x, v)  ((x)->axis = (v))
+#define XAI_CNN_SOFTMAXAF32_PARAMS_SET_BETA(x, v)  ((x)->beta = (v))
+#endif // #if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+
+typedef struct
+{
+  int16_t maxVal;   /* global max value of a 3D tile */
+  uint8_t tileFlag; /* tileFlag can take values 0-3.
+                       0 : neither first nor last tile
+                       1 : first tile
+                       2 : last tile
+                       3 : first and last tile. */
+} xai_cnn_maxval_params;
+
+#define XAI_CNN_MAXVAL_GET_MAXVAL(x)       ((x)->maxVal)
+#define XAI_CNN_MAXVAL_SET_MAXVAL(x, v)    ((x)->maxVal = (v))
+#define XAI_CNN_MAXVAL_GET_TILEFLAG(x)     ((x)->tileFlag)
+#define XAI_CNN_MAXVAL_SET_TILEFLAG(x, v)  ((x)->tileFlag = (v))
+
+typedef struct
+{
+  uint16_t input1Scale;  /* Scaling factor for 1st input */
+  uint16_t input2Scale;  /* Scaling factor for 2nd input */
+  uint8_t  accumShift;   /* Accumulator Shift to bring data to 16b after scaling and addition */
+  uint16_t outputScale;  /* Scaling factor for Output */
+  uint8_t  outputShift;  /* Shift value to bring the final sum to 8b */
+  uint8_t  reluFlag;     /* Enable/Disable Relu at the output */
+  int32_t  minVal;       /* minimum Value for clamping if reluFlag is set to 1 */
+  int32_t  maxVal;       /* maximum Value for clamping if reluFlag is set to 1 */
+  uint8_t  stride;       /* Stride factor */
+  int32_t  fixUpInit;    /* The fixUp term that is used to incorporate Zero Points */
+  uint8_t  sat11;        /* Dummy. Not used for xai_cnn_eltwise_params. Used only in xaicnne. Added it for consistency */
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+  xb_f16   reluMinFlt;
+  xb_f16   reluMaxFlt;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+  float reluMinFlt32;
+  float reluMaxFlt32;
+#endif
+} xai_cnn_eltwise_params;
+
+typedef struct
+{
+  int32_t input1Scale;   /* Scaling factor for 1st input */
+  int32_t input2Scale;   /* Scaling factor for 2nd input */
+  int32_t input1Shift;   /* Shift for 1st input */
+  int32_t input2Shift;   /* Shift for 2nd input */
+  int32_t leftShift;     /* Left Shift for both input */
+  int32_t outputScale;   /* Scaling factor for Output */
+  int32_t outputShift;   /* Shift value to bring the final sum to 8b */
+  int32_t input1Offset;
+  int32_t input2Offset;
+  int32_t outputOffset;
+  uint8_t reluFlag;      /* Enable/Disable Relu at the output */
+  int32_t minVal;        /* minimum Value for clamping if reluFlag is set to 1 */
+  int32_t maxVal;        /* maximum Value for clamping if reluFlag is set to 1 */
+  uint8_t stride;        /* Stride factor */
+  int8_t  quantization_mode;
+}xai_cnn_tfl_eltwise_params;
+
+typedef struct
+{
+  int16_t  input1Scale;       /* Scaling factor for 1st input */
+  int16_t  input2Scale;       /* Scaling factor for 2nd input */
+  uint8_t  accumShift;        /* Accumulator Shift to bring data to 16b after scaling and addition */
+  uint16_t outputScale;       /* Scaling factor for Output */
+  uint8_t  outputShift;       /* Shift value to bring the final sum to 8b */
+  uint8_t  reluFlag;          /* Enable/Disable Relu at the output */
+  int32_t  minVal;            /* minimum Value for clamping if reluFlag is set to 1 */
+  int32_t  maxVal;            /* maximum Value for clamping if reluFlag is set to 1 */
+  uint8_t  stride;            /* Stride factor */
+  int32_t  fixUpInit;         /* The fixUp term that is used to incorporate Zero Points */
+  uint8_t  sat11;             /* Quantization saturation: 0 - 10 bit; 1 - 11 bit; */
+} xnne_eltwise_params;
+
+#define XAI_CNN_ELTWISE_GET_INPUT1SCALE(x)           ((x)->input1Scale)
+#define XAI_CNN_ELTWISE_SET_INPUT1SCALE(x, v)        ((x)->input1Scale = (v))
+#define XAI_CNN_ELTWISE_GET_INPUT2SCALE(x)           ((x)->input2Scale)
+#define XAI_CNN_ELTWISE_SET_INPUT2SCALE(x, v)        ((x)->input2Scale = (v))
+#define XAI_CNN_ELTWISE_GET_INPUT1SHIFT(x)           ((x)->input1Shift)
+#define XAI_CNN_ELTWISE_SET_INPUT1SHIFT(x, v)        ((x)->input1Shift = (v))
+#define XAI_CNN_ELTWISE_GET_INPUT2SHIFT(x)           ((x)->input2Shift)
+#define XAI_CNN_ELTWISE_SET_INPUT2SHIFT(x, v)        ((x)->input2Shift = (v))
+#define XAI_CNN_ELTWISE_GET_LEFTSHIFT(x)             ((x)->leftShift)
+#define XAI_CNN_ELTWISE_SET_LEFTSHIFT(x, v)          ((x)->leftShift = (v))
+#define XAI_CNN_ELTWISE_GET_ACCUMSHIFT(x)            ((x)->accumShift)
+#define XAI_CNN_ELTWISE_SET_ACCUMSHIFT(x, v)         ((x)->accumShift = (v))
+#define XAI_CNN_ELTWISE_GET_OUTPUTSCALE(x)           ((x)->outputScale)
+#define XAI_CNN_ELTWISE_SET_OUTPUTSCALE(x, v)        ((x)->outputScale = (v))
+#define XAI_CNN_ELTWISE_GET_OUTPUTSHIFT(x)           ((x)->outputShift)
+#define XAI_CNN_ELTWISE_SET_OUTPUTSHIFT(x, v)        ((x)->outputShift = (v))
+#define XAI_CNN_ELTWISE_GET_INPUT1_OFFSET(x)         ((x)->input1Offset)
+#define XAI_CNN_ELTWISE_SET_INPUT1_OFFSET(x, v)      ((x)->input1Offset = (v))
+#define XAI_CNN_ELTWISE_GET_INPUT2_OFFSET(x)         ((x)->input2Offset)
+#define XAI_CNN_ELTWISE_SET_INPUT2_OFFSET(x, v)      ((x)->input2Offset = (v))
+#define XAI_CNN_ELTWISE_GET_OUTPUT_OFFSET(x)         ((x)->outputOffset)
+#define XAI_CNN_ELTWISE_SET_OUTPUT_OFFSET(x, v)      ((x)->outputOffset = (v))
+#define XAI_CNN_ELTWISE_GET_QUANTIZATION_MODE(x)     ((x)->quantization_mode)
+#define XAI_CNN_ELTWISE_SET_QUANTIZATION_MODE(x, v)  ((x)->quantization_mode = (v))
+#define XAI_CNN_ELTWISE_GET_RELUFLAG(x)              ((x)->reluFlag)
+#define XAI_CNN_ELTWISE_SET_RELUFLAG(x, v)           ((x)->reluFlag = (v))
+#define XAI_CNN_ELTWISE_GET_MIN_VAL(x)               ((x)->minVal)
+#define XAI_CNN_ELTWISE_SET_MIN_VAL(x, v)            ((x)->minVal = (v))
+#define XAI_CNN_ELTWISE_GET_MAX_VAL(x)               ((x)->maxVal)
+#define XAI_CNN_ELTWISE_SET_MAX_VAL(x, v)            ((x)->maxVal = (v))
+#define XAI_CNN_ELTWISE_GET_STRIDE(x)                ((x)->stride)
+#define XAI_CNN_ELTWISE_SET_STRIDE(x, v)             ((x)->stride = (v))
+#define XAI_CNN_ELTWISE_GET_FIXUPINIT(x)             ((x)->fixUpInit)
+#define XAI_CNN_ELTWISE_SET_FIXUPINIT(x, v)          ((x)->fixUpInit = (v))
+#define XAI_CNN_ELTWISE_GET_SAT11(x)                 ((x)->sat11)
+#define XAI_CNN_ELTWISE_SET_SAT11(x, v)              ((x)->sat11 = (v))
+#define XAI_CNN_ELTWISE_ADD_STRIDE_J1    (1)
+#define XAI_CNN_ELTWISE_ADD_STRIDE_J2    (2)
+#define XAI_CNN_ELTWISE_ADD_STRIDE_J1J2  (3)
+#define XAI_CNN_ELTWISE_SUB_STRIDE_J1    (1)
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+#define XAI_CNN_ELTWISE_GET_RELU_MIN_FLT(x)     ((x)->reluMinFlt)
+#define XAI_CNN_ELTWISE_SET_RELU_MIN_FLT(x, v)  ((x)->reluMinFlt = (v))
+#define XAI_CNN_ELTWISE_GET_RELU_MAX_FLT(x)     ((x)->reluMaxFlt)
+#define XAI_CNN_ELTWISE_SET_RELU_MAX_FLT(x, v)  ((x)->reluMaxFlt = (v))
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+#define XAI_CNN_ELTWISE_GET_RELU_MIN_FLT32(x)     ((x)->reluMinFlt32)
+#define XAI_CNN_ELTWISE_SET_RELU_MIN_FLT32(x, v)  ((x)->reluMinFlt32 = (v))
+#define XAI_CNN_ELTWISE_GET_RELU_MAX_FLT32(x)     ((x)->reluMaxFlt32)
+#define XAI_CNN_ELTWISE_SET_RELU_MAX_FLT32(x, v)  ((x)->reluMaxFlt32 = (v))
+#endif
+
+typedef struct
+{
+  uint16_t inputScale;     /* Scaling factor for Input */
+  uint8_t  inputShift;     /* Input Shift to bring data to 16b after scaling */
+  int32_t  minIdx;         /* Minimum value of input. Corresponds to first element of LUT array. */
+  int32_t  maxIdx;         /* Maximum value of input. Corresponds to last element of LUT array. */
+  uint8_t  tableType;      /* Value to describe the type of Table: 0/1/2 - Normal/Symmetric/Asymmetric */
+  int32_t  lut1Offset;     /* Offset of the 0th entry of lut1Array in Full range LUT table(minIdx <= lut1Offset <= maxIdx). */
+  int32_t  lut2Offset;     /* Offset of the 0th entry of lut2Array in Full range LUT table(minIdx <= lut2Offset <= maxIdx). */
+} xai_cnn_lut_params;
+
+#define XAI_LUT_TYPE_NORMAL         0
+#define XAI_LUT_TYPE_EVENSYMMETRIC  1
+#define XAI_LUT_TYPE_ODDSYMMETRIC   2
+
+#define XAI_CNN_LUT_GET_INPUTSCALE(x)      ((x)->inputScale)
+#define XAI_CNN_LUT_SET_INPUTSCALE(x, v)   ((x)->inputScale = (v))
+#define XAI_CNN_LUT_GET_INPUTSHIFT(x)      ((x)->inputShift)
+#define XAI_CNN_LUT_SET_INPUTSHIFT(x, v)   ((x)->inputShift = (v))
+#define XAI_CNN_LUT_GET_MIN_IDX(x)         ((x)->minIdx)
+#define XAI_CNN_LUT_SET_MIN_IDX(x, v)      ((x)->minIdx = (v))
+#define XAI_CNN_LUT_GET_MAX_IDX(x)         ((x)->maxIdx)
+#define XAI_CNN_LUT_SET_MAX_IDX(x, v)      ((x)->maxIdx = (v))
+#define XAI_CNN_LUT_GET_TABLE_TYPE(x)      ((x)->tableType)
+#define XAI_CNN_LUT_SET_TABLE_TYPE(x, v)   ((x)->tableType = (v))
+#define XAI_CNN_LUT_GET_LUT1_OFFSET(x)     ((x)->lut1Offset)
+#define XAI_CNN_LUT_SET_LUT1_OFFSET(x, v)  ((x)->lut1Offset = (v)) /* argument parenthesized like the other setters */
+#define XAI_CNN_LUT_GET_LUT2_OFFSET(x)     ((x)->lut2Offset)
+#define XAI_CNN_LUT_SET_LUT2_OFFSET(x, v)  ((x)->lut2Offset = (v)) /* argument parenthesized like the other setters */
+
+typedef struct
+{
+  int16_t outputScale;   /* Scaling factor for Output */
+  uint8_t outputShift;   /* Shift value to bring the final product to output datatype */
+  uint8_t reluFlag;      /* Enable/Disable Relu at the output */
+  int32_t minVal;        /* minimum Value for clamping */
+  int32_t maxVal;        /* maximum Value for clamping */
+  int32_t inZero1;
+  int32_t inZero2;
+  int32_t fixUpInit;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+  xb_f16  reluMinFlt;
+  xb_f16  reluMaxFlt;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+  float reluMinFlt32;
+  float reluMaxFlt32;
+#endif
+} xai_cnn_eltwiseMul_params;
+
+#define XAI_CNN_ELTWISE_MUL_GET_OUTPUTSCALE(x)      ((x)->outputScale)
+#define XAI_CNN_ELTWISE_MUL_SET_OUTPUTSCALE(x, v)   ((x)->outputScale = (v))
+#define XAI_CNN_ELTWISE_MUL_GET_OUTPUTSHIFT(x)      ((x)->outputShift)
+#define XAI_CNN_ELTWISE_MUL_SET_OUTPUTSHIFT(x, v)   ((x)->outputShift = (v))
+#define XAI_CNN_ELTWISE_MUL_GET_RELUFLAG(x)         ((x)->reluFlag)
+#define XAI_CNN_ELTWISE_MUL_SET_RELUFLAG(x, v)      ((x)->reluFlag = (v))
+#define XAI_CNN_ELTWISE_MUL_GET_MIN_VAL(x)          ((x)->minVal)
+#define XAI_CNN_ELTWISE_MUL_SET_MIN_VAL(x, v)       ((x)->minVal = (v))
+#define XAI_CNN_ELTWISE_MUL_GET_MAX_VAL(x)          ((x)->maxVal)
+#define XAI_CNN_ELTWISE_MUL_SET_MAX_VAL(x, v)       ((x)->maxVal = (v))
+#define XAI_CNN_ELTWISE_MUL_GET_INZERO_1(x)         ((x)->inZero1)
+#define XAI_CNN_ELTWISE_MUL_SET_INZERO_1(x, v)      ((x)->inZero1 = (v))
+#define XAI_CNN_ELTWISE_MUL_GET_INZERO_2(x)         ((x)->inZero2)
+#define XAI_CNN_ELTWISE_MUL_SET_INZERO_2(x, v)      ((x)->inZero2 = (v))
+#define XAI_CNN_ELTWISE_MUL_GET_FIXUPINIT(x)        ((x)->fixUpInit)
+#define XAI_CNN_ELTWISE_MUL_SET_FIXUPINIT(x, v)     ((x)->fixUpInit = (v))
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+#define XAI_CNN_ELTWISE_MUL_GET_RELU_MIN_FLT(x)     ((x)->reluMinFlt)
+#define XAI_CNN_ELTWISE_MUL_SET_RELU_MIN_FLT(x, v)  ((x)->reluMinFlt = (v))
+#define XAI_CNN_ELTWISE_MUL_GET_RELU_MAX_FLT(x)     ((x)->reluMaxFlt)
+#define XAI_CNN_ELTWISE_MUL_SET_RELU_MAX_FLT(x, v)  ((x)->reluMaxFlt = (v))
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+#define XAI_CNN_ELTWISE_MUL_GET_RELU_MIN_FLT32(x)     ((x)->reluMinFlt32)
+#define XAI_CNN_ELTWISE_MUL_SET_RELU_MIN_FLT32(x, v)  ((x)->reluMinFlt32 = (v))
+#define XAI_CNN_ELTWISE_MUL_GET_RELU_MAX_FLT32(x)     ((x)->reluMaxFlt32)
+#define XAI_CNN_ELTWISE_MUL_SET_RELU_MAX_FLT32(x, v)  ((x)->reluMaxFlt32 = (v))
+#endif
+
+/*SVDF structure */
+typedef struct
+{
+  int32_t nInput;
+  int32_t nFilter;
+  int32_t nMemory;
+  int32_t nBatch;
+  int32_t nRank;
+  int32_t biasFlag;
+  uint8_t shift1;
+  uint8_t shift2;
+  uint8_t accShift1;
+  uint8_t accShift2;
+  int32_t preset;
+  int32_t minVal;
+  int32_t maxVal;
+  uint8_t reluFlag;
+} xai_cnn_svdf_params;
+
+#define S24_MIN                  (-(((int32_t) 1) << 23))
+#define S24_MAX                  ((((int32_t) 1) << 23) - 1)
+#define XCHAL_IVPN_SIMD_WIDTH_2  (XCHAL_IVPN_SIMD_WIDTH >> 1)
+#define USE_24_BIT_ACCUMULATOR
+#define MULQISA                  1
+
+#define XAI_CNN_SVDF_GET_NUMINPUT(x)         ((x)->nInput)
+#define XAI_CNN_SVDF_SET_NUMINPUT(x, v)      ((x)->nInput = (v))
+#define XAI_CNN_SVDF_GET_MIN_VAL(x)          ((x)->minVal)
+#define XAI_CNN_SVDF_SET_MIN_VAL(x, v)       ((x)->minVal = (v))
+#define XAI_CNN_SVDF_GET_MAX_VAL(x)          ((x)->maxVal)
+#define XAI_CNN_SVDF_SET_MAX_VAL(x, v)       ((x)->maxVal = (v))
+#define XAI_CNN_SVDF_GET_RELUFLAG(x)         ((x)->reluFlag)
+#define XAI_CNN_SVDF_SET_RELUFLAG(x, v)      ((x)->reluFlag = (v))
+#define XAI_CNN_SVDF_GET_NUMFILTER(x)        ((x)->nFilter)
+#define XAI_CNN_SVDF_SET_NUMFILTER(x, v)     ((x)->nFilter = (v))
+#define XAI_CNN_SVDF_GET_NUMMEMORY(x)        ((x)->nMemory)
+#define XAI_CNN_SVDF_SET_NUMMEMORY(x, v)     ((x)->nMemory = (v))
+#define XAI_CNN_SVDF_GET_NUMBATCH(x)         ((x)->nBatch)
+#define XAI_CNN_SVDF_SET_NUMBATCH(x, v)      ((x)->nBatch = (v))
+#define XAI_CNN_SVDF_GET_BIASFLAG(x)         ((x)->biasFlag)
+#define XAI_CNN_SVDF_SET_BIASFLAG(x, v)      ((x)->biasFlag = (v))
+#define XAI_CNN_SVDF_GET_RANK(x)             ((x)->nRank)
+#define XAI_CNN_SVDF_SET_RANK(x, v)          ((x)->nRank = (v))
+#define XAI_CNN_SVDF_GET_NUNIT(x)            ((x)->nUnit) /* closing parenthesis restored. NOTE(review): xai_cnn_svdf_params as declared in this header has no nUnit member - confirm the intended field */
+#define XAI_CNN_SVDF_SET_NUNIT(x, v)         ((x)->nUnit = (v)) /* NOTE(review): nUnit is not a member of xai_cnn_svdf_params as declared in this header - confirm */
+#define XAI_CNN_SVDF_GET_OUTPUTSHIFT1(x)     ((x)->shift1)
+#define XAI_CNN_SVDF_SET_OUTPUTSHIFT1(x, v)  ((x)->shift1 = (v))
+#define XAI_CNN_SVDF_GET_OUTPUTSHIFT2(x)     ((x)->shift2)
+#define XAI_CNN_SVDF_SET_OUTPUTSHIFT2(x, v)  ((x)->shift2 = (v))
+#define XAI_CNN_SVDF_GET_ACCSHIFT1(x)        ((x)->accShift1)
+#define XAI_CNN_SVDF_SET_ACCSHIFT1(x, v)     ((x)->accShift1 = (v))
+#define XAI_CNN_SVDF_GET_ACCSHIFT2(x)        ((x)->accShift2)
+#define XAI_CNN_SVDF_SET_ACCSHIFT2(x, v)     ((x)->accShift2 = (v))
+#define XAI_CNN_SVDF_GET_PRESET(x)           ((x)->preset)
+#define XAI_CNN_SVDF_SET_PRESET(x, v)        ((x)->preset = (v))
+
+typedef struct
+{
+  uint16_t tableLength0;    /* Minor table (Table 0) length */
+  uint16_t tableLength1;    /* Major table (Table 1) length */
+  uint16_t inMask0;         /* Mask applied on input while accessing minor table entry */
+  uint16_t inMask1;         /* Mask applied on input while accessing major table entry */
+  uint8_t  inShift0;        /* Shift applied on input while accessing minor table entry */
+  uint8_t  inShift1;        /* Shift applied on input while accessing major table entry */
+  uint8_t  outputShift;     /* No. of output bits to be right shifted. */
+} xai_cnn_exponent_params;
+
+#define XAI_CNN_EXPONENT_GET_OUTPUTSHIFT(x)       ((x)->outputShift)
+#define XAI_CNN_EXPONENT_SET_OUTPUTSHIFT(x, v)    ((x)->outputShift = (v))
+#define XAI_CNN_EXPONENT_GET_TABLELENGTH_0(x)     ((x)->tableLength0)
+#define XAI_CNN_EXPONENT_SET_TABLELENGTH_0(x, v)  ((x)->tableLength0 = (v))
+#define XAI_CNN_EXPONENT_GET_TABLELENGTH_1(x)     ((x)->tableLength1)
+#define XAI_CNN_EXPONENT_SET_TABLELENGTH_1(x, v)  ((x)->tableLength1 = (v))
+#define XAI_CNN_EXPONENT_GET_MASK_0(x)            ((x)->inMask0)
+#define XAI_CNN_EXPONENT_SET_MASK_0(x, v)         ((x)->inMask0 = (v))
+#define XAI_CNN_EXPONENT_GET_MASK_1(x)            ((x)->inMask1)
+#define XAI_CNN_EXPONENT_SET_MASK_1(x, v)         ((x)->inMask1 = (v))
+#define XAI_CNN_EXPONENT_GET_SHIFT_0(x)           ((x)->inShift0)
+#define XAI_CNN_EXPONENT_SET_SHIFT_0(x, v)        ((x)->inShift0 = (v))
+#define XAI_CNN_EXPONENT_GET_SHIFT_1(x)           ((x)->inShift1)
+#define XAI_CNN_EXPONENT_SET_SHIFT_1(x, v)        ((x)->inShift1 = (v))
+
+typedef struct
+{
+  uint8_t stride;       /* Stride factor */
+  uint8_t reverse;      /* Flag to indicate direction of reorg */
+} xai_cnn_reorg_params;
+
+#define XAI_CNN_REORG_GET_STRIDE(x)      ((x)->stride)
+#define XAI_CNN_REORG_SET_STRIDE(x, v)   ((x)->stride = (v))
+#define XAI_CNN_REORG_GET_REVERSE(x)     ((x)->reverse)
+#define XAI_CNN_REORG_SET_REVERSE(x, v)  ((x)->reverse = (v))
+
+typedef struct
+{
+  uint8_t strideX;      /* StrideX factor */
+  uint8_t strideY;      /* StrideY factor */
+  uint8_t reverse;      /* Flag to indicate direction of reorg */
+} xai_cnn_reorg4D_params;
+
+#define XAI_CNN_REORG4D_GET_STRIDEX(x)     ((x)->strideX)
+#define XAI_CNN_REORG4D_SET_STRIDEX(x, v)  ((x)->strideX = (v))
+#define XAI_CNN_REORG4D_GET_STRIDEY(x)     ((x)->strideY)
+#define XAI_CNN_REORG4D_SET_STRIDEY(x, v)  ((x)->strideY = (v))
+#define XAI_CNN_REORG4D_GET_REVERSE(x)     ((x)->reverse)
+#define XAI_CNN_REORG4D_SET_REVERSE(x, v)  ((x)->reverse = (v))
+
+typedef struct
+{
+  uint8_t order1;   /* inTile dimension which will be transposed into dimension 1 of outTile */
+  uint8_t order2;   /* inTile dimension which will be transposed into dimension 2 of outTile */
+  uint8_t order3;   /* inTile dimension which will be transposed into dimension 3 of outTile */
+  uint8_t order4;   /* inTile dimension which will be transposed into dimension 4 of outTile */
+}xai_cnn_permute4D_params;
+
+#define XAI_CNN_PERMUTE4D_GET_ORDER1(x)     ((x)->order1)
+#define XAI_CNN_PERMUTE4D_SET_ORDER1(x, v)  ((x)->order1 = (v))
+#define XAI_CNN_PERMUTE4D_GET_ORDER2(x)     ((x)->order2)
+#define XAI_CNN_PERMUTE4D_SET_ORDER2(x, v)  ((x)->order2 = (v))
+#define XAI_CNN_PERMUTE4D_GET_ORDER3(x)     ((x)->order3)
+#define XAI_CNN_PERMUTE4D_SET_ORDER3(x, v)  ((x)->order3 = (v))
+#define XAI_CNN_PERMUTE4D_GET_ORDER4(x)     ((x)->order4)
+#define XAI_CNN_PERMUTE4D_SET_ORDER4(x, v)  ((x)->order4 = (v))
+
+typedef struct
+{
+  uint32_t groups;       /* Input Groups */
+} xai_cnn_shuffle3D_params;
+
+#define XAI_CNN_SHUFFLE_GET_INTERLEAVEGROUPS(x)     ((x)->groups)
+#define XAI_CNN_SHUFFLE_SET_INTERLEAVEGROUPS(x, v)  ((x)->groups = (v))
+
+typedef struct
+{
+  int32_t xscale;    //Q13.18 format in xaicnn and Q21.10 format in TFL
+  int32_t yscale;    //Q13.18 format in xaicnn and Q21.10 format in TFL
+  int32_t xshift;    //Q13.18 format in xaicnn and Q21.10 format in TFL
+  int32_t yshift;    //Q13.18 format in xaicnn and Q21.10 format in TFL
+  uint8_t extrapolationFlag;
+  int32_t extrapolationValue;
+  int32_t inputFrameWidth;
+  int32_t inputFrameHeight;
+  int8_t  alignCorners;
+  int8_t  halfPixelCenters;
+  float   xscaleFlt;
+  float   yscaleFlt;
+  float   xshiftFlt;
+  float   yshiftFlt;
+  int8_t  quantization_mode;
+} xai_cnn_interp3D_params;
+
+#define XAI_CNN_INTERP3D_GET_XSCALE(x)                      ((x)->xscale)
+#define XAI_CNN_INTERP3D_GET_YSCALE(x)                      ((x)->yscale)
+#define XAI_CNN_INTERP3D_GET_XSHIFT(x)                      ((x)->xshift)
+#define XAI_CNN_INTERP3D_GET_YSHIFT(x)                      ((x)->yshift)
+#define XAI_CNN_INTERP3D_GET_EXTRAPOLATION_FLAG(x)          ((x)->extrapolationFlag)
+#define XAI_CNN_INTERP3D_GET_EXTRAPOLATION_VALUE(x)         ((x)->extrapolationValue)
+#define XAI_CNN_INTERP3D_GET_FRAME_WIDTH(x)                 ((x)->inputFrameWidth)
+#define XAI_CNN_INTERP3D_GET_FRAME_HEIGHT(x)                ((x)->inputFrameHeight)
+#define XAI_CNN_INTERP3D_GET_FLAG_ALIGN_CORNERS(x)          ((x)->alignCorners)
+#define XAI_CNN_INTERP3D_GET_FLAG_HALF_PIXEL_CENTERS(x)     ((x)->halfPixelCenters)
+#define XAI_CNN_INTERP3D_GET_XSCALE_FLT(x)                  ((x)->xscaleFlt)
+#define XAI_CNN_INTERP3D_GET_YSCALE_FLT(x)                  ((x)->yscaleFlt)
+#define XAI_CNN_INTERP3D_GET_XSHIFT_FLT(x)                  ((x)->xshiftFlt)
+#define XAI_CNN_INTERP3D_GET_YSHIFT_FLT(x)                  ((x)->yshiftFlt)
+
+#define XAI_CNN_INTERP3D_SET_XSCALE(x, v)                   ((x)->xscale = (v))
+#define XAI_CNN_INTERP3D_SET_YSCALE(x, v)                   ((x)->yscale = (v))
+#define XAI_CNN_INTERP3D_SET_XSHIFT(x, v)                   ((x)->xshift = (v))
+#define XAI_CNN_INTERP3D_SET_YSHIFT(x, v)                   ((x)->yshift = (v))
+#define XAI_CNN_INTERP3D_SET_EXTRAPOLATION_FLAG(x, v)       ((x)->extrapolationFlag = (v))
+#define XAI_CNN_INTERP3D_SET_EXTRAPOLATION_VALUE(x, v)      ((x)->extrapolationValue = (v))
+#define XAI_CNN_INTERP3D_SET_FRAME_WIDTH(x, v)              ((x)->inputFrameWidth = (v))
+#define XAI_CNN_INTERP3D_SET_FRAME_HEIGHT(x, v)             ((x)->inputFrameHeight = (v))
+#define XAI_CNN_INTERP3D_SET_FLAG_ALIGN_CORNERS(x, v)       ((x)->alignCorners = (v)) /* argument parenthesized like the other setters */
+#define XAI_CNN_INTERP3D_SET_FLAG_HALF_PIXEL_CENTERS(x, v)  ((x)->halfPixelCenters = (v)) /* argument parenthesized like the other setters */
+#define XAI_CNN_INTERP3D_SET_XSCALE_FLT(x, v)               ((x)->xscaleFlt = (v))
+#define XAI_CNN_INTERP3D_SET_YSCALE_FLT(x, v)               ((x)->yscaleFlt = (v))
+#define XAI_CNN_INTERP3D_SET_XSHIFT_FLT(x, v)               ((x)->xshiftFlt = (v))
+#define XAI_CNN_INTERP3D_SET_YSHIFT_FLT(x, v)               ((x)->yshiftFlt = (v))
+#define XAI_CNN_INTERP3D_GET_QUANTIZATION_MODE(x)           ((x)->quantization_mode)
+#define XAI_CNN_INTERP3D_SET_QUANTIZATION_MODE(x, v)        ((x)->quantization_mode = (v))
+
+typedef struct  /* parameter block read/written through the XAI_CNN_RESIZENEAREST3D_* accessors below */
+{
+  int32_t xscale;  //Q13.18 format
+  int32_t yscale;  //Q13.18 format
+  int32_t xshift;  //Q13.18 format
+  int32_t yshift;  //Q13.18 format
+  int32_t inputFrameWidth;
+  int32_t inputFrameHeight;
+  int8_t  alignCorners;
+  int8_t  halfPixelCenters;
+  float   xscaleFlt;
+  float   yscaleFlt;
+  float   xshiftFlt;
+  float   yshiftFlt;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+  xb_f16  xscaleFlt16;  // present only under the guard above, so the struct layout depends on the build configuration
+  xb_f16  yscaleFlt16;
+#endif
+  int8_t  quantization_mode;
+} xai_cnn_resize_nearest3D_params;
+
+#define XAI_CNN_RESIZENEAREST3D_GET_XSCALE(x)                   ((x)->xscale)
+#define XAI_CNN_RESIZENEAREST3D_GET_YSCALE(x)                   ((x)->yscale)
+#define XAI_CNN_RESIZENEAREST3D_GET_XSHIFT(x)                   ((x)->xshift)
+#define XAI_CNN_RESIZENEAREST3D_GET_YSHIFT(x)                   ((x)->yshift)
+#define XAI_CNN_RESIZENEAREST3D_GET_FLAG_ALIGN_CORNERS(x)       ((x)->alignCorners)
+#define XAI_CNN_RESIZENEAREST3D_GET_FLAG_HALF_PIXEL_CENTERS(x)  ((x)->halfPixelCenters)
+#define XAI_CNN_RESIZENEAREST3D_GET_FRAME_WIDTH(x)              ((x)->inputFrameWidth)
+#define XAI_CNN_RESIZENEAREST3D_GET_FRAME_HEIGHT(x)             ((x)->inputFrameHeight)
+#define XAI_CNN_RESIZENEAREST3D_GET_XSCALE_FLT(x)               ((x)->xscaleFlt)
+#define XAI_CNN_RESIZENEAREST3D_GET_YSCALE_FLT(x)               ((x)->yscaleFlt)
+#define XAI_CNN_RESIZENEAREST3D_GET_XSHIFT_FLT(x)               ((x)->xshiftFlt)
+#define XAI_CNN_RESIZENEAREST3D_GET_YSHIFT_FLT(x)               ((x)->yshiftFlt)
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+#define XAI_CNN_RESIZENEAREST3D_GET_XSCALE_FLT16(x)             ((x)->xscaleFlt16)
+#define XAI_CNN_RESIZENEAREST3D_GET_YSCALE_FLT16(x)             ((x)->yscaleFlt16)
+#endif
+
+#define XAI_CNN_RESIZENEAREST3D_SET_XSCALE(x, v)                   ((x)->xscale = (v))
+#define XAI_CNN_RESIZENEAREST3D_SET_YSCALE(x, v)                   ((x)->yscale = (v))
+#define XAI_CNN_RESIZENEAREST3D_SET_XSHIFT(x, v)                   ((x)->xshift = (v))
+#define XAI_CNN_RESIZENEAREST3D_SET_YSHIFT(x, v)                   ((x)->yshift = (v))
+#define XAI_CNN_RESIZENEAREST3D_SET_FLAG_ALIGN_CORNERS(x, v)       ((x)->alignCorners = (v))      /* value argument parenthesized like every other setter */
+#define XAI_CNN_RESIZENEAREST3D_SET_FLAG_HALF_PIXEL_CENTERS(x, v)  ((x)->halfPixelCenters = (v))  /* value argument parenthesized like every other setter */
+#define XAI_CNN_RESIZENEAREST3D_SET_FRAME_WIDTH(x, v)              ((x)->inputFrameWidth = (v))
+#define XAI_CNN_RESIZENEAREST3D_SET_FRAME_HEIGHT(x, v)             ((x)->inputFrameHeight = (v))
+#define XAI_CNN_RESIZENEAREST3D_SET_XSCALE_FLT(x, v)               ((x)->xscaleFlt = (v))
+#define XAI_CNN_RESIZENEAREST3D_SET_YSCALE_FLT(x, v)               ((x)->yscaleFlt = (v))
+#define XAI_CNN_RESIZENEAREST3D_SET_XSHIFT_FLT(x, v)               ((x)->xshiftFlt = (v))
+#define XAI_CNN_RESIZENEAREST3D_SET_YSHIFT_FLT(x, v)               ((x)->yshiftFlt = (v))
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+#define XAI_CNN_RESIZENEAREST3D_SET_XSCALE_FLT16(x, v)             ((x)->xscaleFlt16 = (v))
+#define XAI_CNN_RESIZENEAREST3D_SET_YSCALE_FLT16(x, v)             ((x)->yscaleFlt16 = (v))
+#endif
+#define XAI_CNN_RESIZENEAREST3D_GET_QUANTIZATION_MODE(x)           ((x)->quantization_mode)
+#define XAI_CNN_RESIZENEAREST3D_SET_QUANTIZATION_MODE(x, v)        ((x)->quantization_mode = (v))
+
+
+typedef struct
+{
+  int16_t epsilon;                    // Always added or max val is considered based on tileFlag.
+  uint8_t normType;                   // (1= L1 Norm, 2 = L2 Norm)
+  uint8_t normAxis;                   // bit mask of CNN_NORMALIZE_ALONG_* values: the combination of axes along which to normalize
+  uint8_t channelShareFlag;           // indicates whether we have a single scale value or an array equal to number of channels
+  uint8_t tileFlag;                   // indicates whether the given tile is a first tile, last tile or neither of those
+  uint8_t tensorFlowFlag;             // describes the usage of epsilon
+  int8_t  quantScaleTableShift;       // shift value for scalar table
+  int8_t  rSqrtTableShift;            // shift value for recip square root table
+  int8_t  recipTableShift;            // shift value for recip table
+  int8_t  rSqrtIndexShift;            // shift value recip-square-root table index
+  int8_t  sumSquareShift;             // shift value for sum of squares
+  float  epsilonFlt;                 // floating point epsilon to be added to avoid divide by zero
+  float  sumSqScaleFlt;              // floating point scale value to be multiplied to sum of squares, to account for divide by N factor
+  int8_t  quantization_mode;          // NOTE(review): the mode values are not defined in this section - confirm
+} xai_cnn_normalize3D_params;
+/* Flag getters yield the masked bits (non-zero when set, not a normalised 0/1); SET ORs the mask in, RESET clears it. */
+#define XAI_CNN_NORMALIZE3D_GET_EPSILON(x)                               ((x)->epsilon)
+#define XAI_CNN_NORMALIZE3D_SET_EPSILON(x, v)                            ((x)->epsilon = (v))
+#define XAI_CNN_NORMALIZE3D_GET_NORM_TYPE(x)                             ((x)->normType)
+#define XAI_CNN_NORMALIZE3D_SET_NORM_TYPE(x, v)                          ((x)->normType = (v))
+#define XAI_CNN_NORMALIZE3D_GET_NORMALIZE_ALONG_WIDTH(x)                 ((x)->normAxis & CNN_NORMALIZE_ALONG_WIDTH)
+#define XAI_CNN_NORMALIZE3D_SET_NORMALIZE_ALONG_WIDTH(x)                 ((x)->normAxis = ((x)->normAxis | CNN_NORMALIZE_ALONG_WIDTH))
+#define XAI_CNN_NORMALIZE3D_RESET_NORMALIZE_ALONG_WIDTH(x)               ((x)->normAxis = ((x)->normAxis & ~CNN_NORMALIZE_ALONG_WIDTH))
+#define XAI_CNN_NORMALIZE3D_GET_NORMALIZE_ALONG_HEIGHT(x)                ((x)->normAxis & CNN_NORMALIZE_ALONG_HEIGHT)
+#define XAI_CNN_NORMALIZE3D_SET_NORMALIZE_ALONG_HEIGHT(x)                ((x)->normAxis = ((x)->normAxis | CNN_NORMALIZE_ALONG_HEIGHT))
+#define XAI_CNN_NORMALIZE3D_RESET_NORMALIZE_ALONG_HEIGHT(x)              ((x)->normAxis = ((x)->normAxis & ~CNN_NORMALIZE_ALONG_HEIGHT))
+#define XAI_CNN_NORMALIZE3D_GET_NORMALIZE_ALONG_WIDTH_AND_HEIGHT(x)      ((x)->normAxis & CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT)
+#define XAI_CNN_NORMALIZE3D_SET_NORMALIZE_ALONG_WIDTH_AND_HEIGHT(x)      ((x)->normAxis = ((x)->normAxis | CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT))
+#define XAI_CNN_NORMALIZE3D_RESET_NORMALIZE_ALONG_WIDTH_AND_HEIGHT(x)    ((x)->normAxis = ((x)->normAxis & ~CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT))
+#define XAI_CNN_NORMALIZE3D_GET_NORMALIZE_ALONG_DEPTH(x)                 ((x)->normAxis & CNN_NORMALIZE_ALONG_DEPTH)
+#define XAI_CNN_NORMALIZE3D_SET_NORMALIZE_ALONG_DEPTH(x)                 ((x)->normAxis = ((x)->normAxis | CNN_NORMALIZE_ALONG_DEPTH))
+#define XAI_CNN_NORMALIZE3D_RESET_NORMALIZE_ALONG_DEPTH(x)               ((x)->normAxis = ((x)->normAxis & ~CNN_NORMALIZE_ALONG_DEPTH))
+#define XAI_CNN_NORMALIZE3D_GET_NORMALIZE_ALONG_BATCH(x)                 ((x)->normAxis & CNN_NORMALIZE_ALONG_BATCH)
+#define XAI_CNN_NORMALIZE3D_SET_NORMALIZE_ALONG_BATCH(x)                 ((x)->normAxis = ((x)->normAxis | CNN_NORMALIZE_ALONG_BATCH))
+#define XAI_CNN_NORMALIZE3D_RESET_NORMALIZE_ALONG_BATCH(x)               ((x)->normAxis = ((x)->normAxis & ~CNN_NORMALIZE_ALONG_BATCH))
+#define XAI_CNN_NORMALIZE3D_GET_NORMALIZE_ALONG_WIDTH_HEIGHT_DEPTH(x)    ((x)->normAxis & CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT_AND_DEPTH)
+#define XAI_CNN_NORMALIZE3D_SET_NORMALIZE_ALONG_WIDTH_HEIGHT_DEPTH(x)    ((x)->normAxis = ((x)->normAxis | CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT_AND_DEPTH))
+#define XAI_CNN_NORMALIZE3D_RESET_NORMALIZE_ALONG_WIDTH_HEIGHT_DEPTH(x)  ((x)->normAxis = ((x)->normAxis & ~CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT_AND_DEPTH))
+#define XAI_CNN_NORMALIZE3D_GET_CHANNEL_SHARE_FLAG(x)                    ((x)->channelShareFlag & CNN_NORMALIZE_CHANNEL_SHARE_FLAG)
+#define XAI_CNN_NORMALIZE3D_SET_CHANNEL_SHARE_FLAG(x)                    ((x)->channelShareFlag = ((x)->channelShareFlag | CNN_NORMALIZE_CHANNEL_SHARE_FLAG))
+#define XAI_CNN_NORMALIZE3D_RESET_CHANNEL_SHARE_FLAG(x)                  ((x)->channelShareFlag = ((x)->channelShareFlag & ~CNN_NORMALIZE_CHANNEL_SHARE_FLAG))
+#define XAI_CNN_NORMALIZE3D_GET_TILE_FLAG(x)                             ((x)->tileFlag)
+#define XAI_CNN_NORMALIZE3D_SET_TILE_FLAG(x, v)                          ((x)->tileFlag = (v))
+#define XAI_CNN_NORMALIZE3D_GET_TENSORFLOW_FLAG(x)                       ((x)->tensorFlowFlag)
+#define XAI_CNN_NORMALIZE3D_SET_TENSORFLOW_FLAG(x, v)                    ((x)->tensorFlowFlag = (v))
+#define XAI_CNN_NORMALIZE3D_GET_RSQRT_TABLE_SHIFT(x)                     ((x)->rSqrtTableShift)
+#define XAI_CNN_NORMALIZE3D_SET_RSQRT_TABLE_SHIFT(x, v)                  ((x)->rSqrtTableShift = (v))
+#define XAI_CNN_NORMALIZE3D_GET_RECIP_TABLE_SHIFT(x)                     ((x)->recipTableShift)
+#define XAI_CNN_NORMALIZE3D_SET_RECIP_TABLE_SHIFT(x, v)                  ((x)->recipTableShift = (v))
+#define XAI_CNN_NORMALIZE3D_GET_RSQRT_INDEX_SHIFT(x)                     ((x)->rSqrtIndexShift)
+#define XAI_CNN_NORMALIZE3D_SET_RSQRT_INDEX_SHIFT(x, v)                  ((x)->rSqrtIndexShift = (v))
+#define XAI_CNN_NORMALIZE3D_GET_SUM_SQUARE_SHIFT(x)                      ((x)->sumSquareShift)
+#define XAI_CNN_NORMALIZE3D_SET_SUM_SQUARE_SHIFT(x, v)                   ((x)->sumSquareShift = (v))
+#define XAI_CNN_NORMALIZE3D_GET_QUANT_SCALE_TABLE_SHIFT(x)               ((x)->quantScaleTableShift)
+#define XAI_CNN_NORMALIZE3D_SET_QUANT_SCALE_TABLE_SHIFT(x, v)            ((x)->quantScaleTableShift = (v))
+#define XAI_CNN_NORMALIZE3D_GET_EPSILON_FLT(x)                           ((x)->epsilonFlt)
+#define XAI_CNN_NORMALIZE3D_SET_EPSILON_FLT(x, v)                        ((x)->epsilonFlt = (v))
+#define XAI_CNN_NORMALIZE3D_GET_SUM_SQ_SCALE_FLT(x)                      ((x)->sumSqScaleFlt)
+#define XAI_CNN_NORMALIZE3D_SET_SUM_SQ_SCALE_FLT(x, v)                   ((x)->sumSqScaleFlt = (v))
+#define XAI_CNN_NORMALIZE3D_GET_QUANTIZATION_MODE(x)                     ((x)->quantization_mode)
+#define XAI_CNN_NORMALIZE3D_SET_QUANTIZATION_MODE(x, v)                  ((x)->quantization_mode = (v))
+
+typedef struct
+{
+  uint8_t outputShift;   /* Shift value to bring the final value to 8b */
+  uint8_t tileFlag;
+  uint8_t meanShift;  /* shift S used to perform the division (see meanScale) */
+  uint8_t sqAccShift; /* shift value for the accumulation of squares to 32 bits */
+  int32_t meanScale;  /*Scale = (1<<S) /H*W */
+  uint8_t reluFlag;   /* Enable/Disable Relu at the output */
+  int32_t minVal;     /* minimum Value for clamping if reluFlag is set to 1 */
+  int32_t maxVal;     /* maximum Value for clamping if reluFlag is set to 1 */
+  int32_t axis;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+  xb_f16  reluMinFlt;
+  xb_f16  reluMaxFlt;
+  xb_f16  epsilon;
+  xb_f16  meanScaleFlt;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+  float reluMinFlt32;
+  float reluMaxFlt32;
+  float epsilonFlt32;
+  float meanScaleFlt32;
+#endif
+} xai_cnn_instance_norm_param;
+
+#define XAI_CNN_INSTANCE_NORM_GET_OUTPUTSHIFT(x)       ((x)->outputShift)
+#define XAI_CNN_INSTANCE_NORM_SET_OUTPUTSHIFT(x, v)    ((x)->outputShift = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_TILEFLAG(x)          ((x)->tileFlag)
+#define XAI_CNN_INSTANCE_NORM_SET_TILEFLAG(x, v)       ((x)->tileFlag = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_MEANSCALE(x)         ((x)->meanScale)
+#define XAI_CNN_INSTANCE_NORM_SET_MEANSCALE(x, v)      ((x)->meanScale = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_MEANSHIFT(x)         ((x)->meanShift)
+#define XAI_CNN_INSTANCE_NORM_SET_MEANSHIFT(x, v)      ((x)->meanShift = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_RELUFLAG(x)          ((x)->reluFlag)
+#define XAI_CNN_INSTANCE_NORM_SET_RELUFLAG(x, v)       ((x)->reluFlag = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_MIN_VAL(x)           ((x)->minVal)
+#define XAI_CNN_INSTANCE_NORM_SET_MIN_VAL(x, v)        ((x)->minVal = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_MAX_VAL(x)           ((x)->maxVal)
+#define XAI_CNN_INSTANCE_NORM_SET_MAX_VAL(x, v)        ((x)->maxVal = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_SQACCSHIFT(x)        ((x)->sqAccShift)
+#define XAI_CNN_INSTANCE_NORM_SET_SQACCSHIFT(x, v)     ((x)->sqAccShift = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_AXIS(x)              ((x)->axis)
+#define XAI_CNN_INSTANCE_NORM_SET_AXIS(x, v)           ((x)->axis = (v))
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+/* Accessors for the xb_f16 members; guarded by the same condition as the members
+ * themselves. Note that the *_EPSILON_FLT pair maps to the member named 'epsilon'. */
+#define XAI_CNN_INSTANCE_NORM_GET_RELU_MIN_FLT(x)      ((x)->reluMinFlt)
+#define XAI_CNN_INSTANCE_NORM_SET_RELU_MIN_FLT(x, v)   ((x)->reluMinFlt = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_RELU_MAX_FLT(x)      ((x)->reluMaxFlt)
+#define XAI_CNN_INSTANCE_NORM_SET_RELU_MAX_FLT(x, v)   ((x)->reluMaxFlt = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_EPSILON_FLT(x)       ((x)->epsilon)
+#define XAI_CNN_INSTANCE_NORM_SET_EPSILON_FLT(x, v)    ((x)->epsilon = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_MEANSCALE_FLT(x)     ((x)->meanScaleFlt)
+#define XAI_CNN_INSTANCE_NORM_SET_MEANSCALE_FLT(x, v)  ((x)->meanScaleFlt = (v))
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+/* Accessors for the float (*Flt32) members; guarded by the same condition as the
+ * members themselves. */
+#define XAI_CNN_INSTANCE_NORM_GET_RELU_MIN_FLT32(x)      ((x)->reluMinFlt32)
+#define XAI_CNN_INSTANCE_NORM_SET_RELU_MIN_FLT32(x, v)   ((x)->reluMinFlt32 = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_RELU_MAX_FLT32(x)      ((x)->reluMaxFlt32)
+#define XAI_CNN_INSTANCE_NORM_SET_RELU_MAX_FLT32(x, v)   ((x)->reluMaxFlt32 = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_EPSILON_FLT32(x)       ((x)->epsilonFlt32)
+#define XAI_CNN_INSTANCE_NORM_SET_EPSILON_FLT32(x, v)    ((x)->epsilonFlt32 = (v))
+#define XAI_CNN_INSTANCE_NORM_GET_MEANSCALE_FLT32(x)     ((x)->meanScaleFlt32)
+#define XAI_CNN_INSTANCE_NORM_SET_MEANSCALE_FLT32(x, v)  ((x)->meanScaleFlt32 = (v))
+#endif
+
+typedef struct
+{
+  uint32_t valueR;   /* Constant value that is divided by the divisor of each
+                        channel; its maximum is (2^15) - 1 for I8 input and
+                        2^31-1 for S16 input */
+  uint8_t outShift;  /* Shift applied to the scaled output */
+} xai_cnn_divide3D_params;
+
+#define XAI_CNN_CHANNELWISE_DIVIDE_GET_VALUE_R(prm)         ((prm)->valueR)
+#define XAI_CNN_CHANNELWISE_DIVIDE_GET_OUT_SHIFT(prm)       ((prm)->outShift)
+#define XAI_CNN_CHANNELWISE_DIVIDE_SET_VALUE_R(prm, val)    ((prm)->valueR = (val))
+#define XAI_CNN_CHANNELWISE_DIVIDE_SET_OUT_SHIFT(prm, val)  ((prm)->outShift = (val))
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+/* Flag values occupying bits 0, 1 and 2; they are only defined when the guard above holds. */
+#define CNNA_CONV_F16_FLAG_RELU      1
+#define CNNA_CONV_F16_FLAG_LEFTEDGE  (1 << 1)
+#define CNNA_CONV_F16_FLAG_TOPEDGE   (1 << 2)
+#endif // #if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+typedef struct
+{
+  uint16_t spatialScaleShiftX;      /* Shift used by spatial-scale operations along X               */
+  uint16_t spatialScaleShiftY;      /* Shift used by spatial-scale operations along Y               */
+  uint16_t outShift;                /* One of 7, 8, 15, 16 or 23, depending on the input datatype   */
+  int32_t  extrapolationValue;      /* Value used during extrapolation                              */
+  int32_t  roiStride;               /* Stride of the ROI coordinates                                */
+  uint8_t  tensorFlowFlag;          /* Switches box-coordinate ordering from Caffe2 to TensorFlow   */
+} xai_cnn_cropResize3D_params;
+
+#define XAI_CNN_CROP_RESIZE3D_GET_SPATIAL_SCALE_SHIFTX(prm)       ((prm)->spatialScaleShiftX)
+#define XAI_CNN_CROP_RESIZE3D_GET_SPATIAL_SCALE_SHIFTY(prm)       ((prm)->spatialScaleShiftY)
+#define XAI_CNN_CROP_RESIZE3D_GET_OUT_SHIFT(prm)                  ((prm)->outShift)
+#define XAI_CNN_CROP_RESIZE3D_GET_EXTRAPOLATION_VALUE(prm)        ((prm)->extrapolationValue)
+#define XAI_CNN_CROP_RESIZE3D_GET_ROI_STRIDE(prm)                 ((prm)->roiStride)
+#define XAI_CNN_CROP_RESIZE3D_GET_TENSORFLOW_FLAG(prm)            ((prm)->tensorFlowFlag)
+#define XAI_CNN_CROP_RESIZE3D_SET_SPATIAL_SCALE_SHIFTX(prm, val)  ((prm)->spatialScaleShiftX = (val))
+#define XAI_CNN_CROP_RESIZE3D_SET_SPATIAL_SCALE_SHIFTY(prm, val)  ((prm)->spatialScaleShiftY = (val))
+#define XAI_CNN_CROP_RESIZE3D_SET_OUT_SHIFT(prm, val)             ((prm)->outShift = (val))
+#define XAI_CNN_CROP_RESIZE3D_SET_EXTRAPOLATION_VALUE(prm, val)   ((prm)->extrapolationValue = (val))
+#define XAI_CNN_CROP_RESIZE3D_SET_ROI_STRIDE(prm, val)            ((prm)->roiStride = (val))
+#define XAI_CNN_CROP_RESIZE3D_SET_TENSORFLOW_FLAG(prm, val)       ((prm)->tensorFlowFlag = (val))
+
+typedef struct
+{
+  uint8_t outputShift;   /* Shift that brings the final value to 8b */
+  uint8_t reluFlag;      /* Enables/disables Relu at the output */
+  int32_t minVal;        /* Lower clamping value, used if reluFlag is set to 1 */
+  int32_t maxVal;        /* Upper clamping value, used if reluFlag is set to 1 */
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+  xb_f16  reluMinFlt;
+  xb_f16  reluMaxFlt;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+  float reluMinFlt32;
+  float reluMaxFlt32;
+#endif
+} xai_cnn_batchnorm_params;
+
+#define XAI_CNN_BATCHNORM_GET_OUTPUTSHIFT(prm)       ((prm)->outputShift)
+#define XAI_CNN_BATCHNORM_GET_RELUFLAG(prm)          ((prm)->reluFlag)
+#define XAI_CNN_BATCHNORM_GET_MIN_VAL(prm)           ((prm)->minVal)
+#define XAI_CNN_BATCHNORM_GET_MAX_VAL(prm)           ((prm)->maxVal)
+#define XAI_CNN_BATCHNORM_SET_OUTPUTSHIFT(prm, val)  ((prm)->outputShift = (val))
+#define XAI_CNN_BATCHNORM_SET_RELUFLAG(prm, val)     ((prm)->reluFlag = (val))
+#define XAI_CNN_BATCHNORM_SET_MIN_VAL(prm, val)      ((prm)->minVal = (val))
+#define XAI_CNN_BATCHNORM_SET_MAX_VAL(prm, val)      ((prm)->maxVal = (val))
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+#define XAI_CNN_BATCHNORM_GET_RELU_MIN_FLT(prm)       ((prm)->reluMinFlt)
+#define XAI_CNN_BATCHNORM_GET_RELU_MAX_FLT(prm)       ((prm)->reluMaxFlt)
+#define XAI_CNN_BATCHNORM_SET_RELU_MIN_FLT(prm, val)  ((prm)->reluMinFlt = (val))
+#define XAI_CNN_BATCHNORM_SET_RELU_MAX_FLT(prm, val)  ((prm)->reluMaxFlt = (val))
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+#define XAI_CNN_BATCHNORM_GET_RELU_MIN_FLT32(prm)       ((prm)->reluMinFlt32)
+#define XAI_CNN_BATCHNORM_GET_RELU_MAX_FLT32(prm)       ((prm)->reluMaxFlt32)
+#define XAI_CNN_BATCHNORM_SET_RELU_MIN_FLT32(prm, val)  ((prm)->reluMinFlt32 = (val))
+#define XAI_CNN_BATCHNORM_SET_RELU_MAX_FLT32(prm, val)  ((prm)->reluMaxFlt32 = (val))
+#endif
+
+typedef struct
+{
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+  xb_f16 lambdaF16;
+  xb_f16 alphaF16;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+  float lambdaF32;
+  float alphaF32;
+#endif
+} xai_cnn_selu_params;
+/* NOTE(review): if neither guard above holds, the struct has no members (an empty struct is a GNU C extension) - confirm that configuration is never built. */
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+#define XAI_CNN_SELU_GET_LAMBDA16(x)     ((x)->lambdaF16)
+#define XAI_CNN_SELU_SET_LAMBDA16(x, v)  ((x)->lambdaF16 = (v))
+#define XAI_CNN_SELU_GET_ALPHA16(x)      ((x)->alphaF16)
+#define XAI_CNN_SELU_SET_ALPHA16(x, v)   ((x)->alphaF16 = (v))
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+#define XAI_CNN_SELU_GET_LAMBDA32(x)     ((x)->lambdaF32)
+#define XAI_CNN_SELU_SET_LAMBDA32(x, v)  ((x)->lambdaF32 = (v))
+#define XAI_CNN_SELU_GET_ALPHA32(x)      ((x)->alphaF32)
+#define XAI_CNN_SELU_SET_ALPHA32(x, v)   ((x)->alphaF32 = (v))
+#endif
+
+typedef struct
+{
+  int16_t  ZeroIn;      /* Zero-point value of the input tile */
+  int16_t  ZeroOut;     /* Zero-point value of the output */
+  uint16_t renormScale; /* Scale applied to (input - ZeroIn) */
+  uint8_t  renormShift; /* Shift applied to obtain the S8 output */
+} xai_cnn_renorm_params;
+
+#define XAI_CNN_RENORM_GET_ZEROIN(prm)            ((prm)->ZeroIn)
+#define XAI_CNN_RENORM_GET_ZEROOUT(prm)           ((prm)->ZeroOut)
+#define XAI_CNN_RENORM_GET_RENORMSCALE(prm)       ((prm)->renormScale)
+#define XAI_CNN_RENORM_GET_RENORMSHIFT(prm)       ((prm)->renormShift)
+#define XAI_CNN_RENORM_SET_ZEROIN(prm, val)       ((prm)->ZeroIn = (val))
+#define XAI_CNN_RENORM_SET_ZEROOUT(prm, val)      ((prm)->ZeroOut = (val))
+#define XAI_CNN_RENORM_SET_RENORMSCALE(prm, val)  ((prm)->renormScale = (val))
+#define XAI_CNN_RENORM_SET_RENORMSHIFT(prm, val)  ((prm)->renormShift = (val))
+
+#if (XCHAL_HAVE_HIFI1 || XCHAL_HAVE_HIFI3Z || XCHAL_HAVE_HIFI4 || XCHAL_HAVE_HIFI5)
+typedef struct
+{
+  int32_t ZeroIn;   // zero points are 32-bit in this branch; the #else branch declares them as int16_t
+  int32_t ZeroOut;
+  int32_t requantScale;
+  int32_t requantShift;
+  int8_t  quantization_mode;
+} xai_cnn_tfl_requantize_params;
+
+#define XAI_CNN_REQUANT_GET_ZEROIN(x)                ((x)->ZeroIn)
+#define XAI_CNN_REQUANT_SET_ZEROIN(x, v)             ((x)->ZeroIn = (v))
+#define XAI_CNN_REQUANT_GET_ZEROOUT(x)               ((x)->ZeroOut)
+#define XAI_CNN_REQUANT_SET_ZEROOUT(x, v)            ((x)->ZeroOut = (v))
+#define XAI_CNN_REQUANT_GET_REQUANTSCALE(x)          ((x)->requantScale)
+#define XAI_CNN_REQUANT_SET_REQUANTSCALE(x, v)       ((x)->requantScale = (v))
+#define XAI_CNN_REQUANT_GET_REQUANTSHIFT(x)          ((x)->requantShift)
+#define XAI_CNN_REQUANT_SET_REQUANTSHIFT(x, v)       ((x)->requantShift = (v))
+#define XAI_CNN_REQUANT_GET_QUANTIZATION_MODE(x)     ((x)->quantization_mode)
+#define XAI_CNN_REQUANT_SET_QUANTIZATION_MODE(x, v)  ((x)->quantization_mode = (v))
+
+#else  // none of the XCHAL_HAVE_HIFI* macros tested above is non-zero
+typedef struct
+{
+  int16_t ZeroIn;   // zero points are 16-bit in this branch
+  int16_t ZeroOut;
+  int32_t requantScale;
+  int32_t requantShift;
+  int8_t  quantization_mode;
+} xai_cnn_tfl_requantize_params;
+/* The accessor macros below are identical to those in the #if branch; keep the two copies in sync. */
+#define XAI_CNN_REQUANT_GET_ZEROIN(x)                ((x)->ZeroIn)
+#define XAI_CNN_REQUANT_SET_ZEROIN(x, v)             ((x)->ZeroIn = (v))
+#define XAI_CNN_REQUANT_GET_ZEROOUT(x)               ((x)->ZeroOut)
+#define XAI_CNN_REQUANT_SET_ZEROOUT(x, v)            ((x)->ZeroOut = (v))
+#define XAI_CNN_REQUANT_GET_REQUANTSCALE(x)          ((x)->requantScale)
+#define XAI_CNN_REQUANT_SET_REQUANTSCALE(x, v)       ((x)->requantScale = (v))
+#define XAI_CNN_REQUANT_GET_REQUANTSHIFT(x)          ((x)->requantShift)
+#define XAI_CNN_REQUANT_SET_REQUANTSHIFT(x, v)       ((x)->requantShift = (v))
+#define XAI_CNN_REQUANT_GET_QUANTIZATION_MODE(x)     ((x)->quantization_mode)
+#define XAI_CNN_REQUANT_SET_QUANTIZATION_MODE(x, v)  ((x)->quantization_mode = (v))
+#endif  // XCHAL_HAVE_HIFI1 || XCHAL_HAVE_HIFI3Z || XCHAL_HAVE_HIFI4 || XCHAL_HAVE_HIFI5
+
+typedef struct
+{
+  uint16_t outputScale; /* Scaling factor applied to the output */
+  uint8_t  outputShift; /* Shift that brings the output to 16b */
+} xai_cnn_relu_params;
+
+#define XAI_CNN_RELU_GET_OUTPUTSCALE(prm)       ((prm)->outputScale)
+#define XAI_CNN_RELU_GET_OUTPUTSHIFT(prm)       ((prm)->outputShift)
+#define XAI_CNN_RELU_SET_OUTPUTSCALE(prm, val)  ((prm)->outputScale = (val))
+#define XAI_CNN_RELU_SET_OUTPUTSHIFT(prm, val)  ((prm)->outputShift = (val))
+
+typedef struct
+{
+  uint8_t  config;       // Determines reduction across particular dimensions (OR of XAI_CNN_REDUCE_DIM* below)
+  uint8_t  tileFlag;     // Determines which tile is currently being processed
+                         // 0-> intermediate tile, 1-> first tile, 2 --> last tile, 3 --> first and last tile
+  int32_t  fixUpInit;    // The fixUp term that is used to incorporate Zero Points
+  uint8_t  accShiftU;    // The value by which the accumulated value is right shifted
+  uint8_t  outShiftU;    // The value by which the intermediate output value is right shifted
+  uint16_t outScale;     // The value by which acc-shifted value is multiplied to give intermediate output value
+  uint8_t  enableReLu;   // Indicates if relu functionality needs to be enabled (1) or not (0)
+  int64_t  reluMin;      // The lower limit value which will be used for clamping the outputs
+  int64_t  reluMax;      // The upper limit value which will be used for clamping the outputs
+  bool     take_abs;     // Indicates if absolute value needs to be taken (true) or not (false)
+  int32_t  redEleCount;  // Total number of elements reduced in the output
+} xai_cnn_reduce_params;
+
+#define XAI_CNN_REDUCE_GET_CONFIG(x)                     ((x)->config)
+#define XAI_CNN_REDUCE_GET_TILEFLAG(x)                   ((x)->tileFlag)
+#define XAI_CNN_REDUCE_GET_FIXUPINIT(x)                  ((x)->fixUpInit)
+#define XAI_CNN_REDUCE_GET_ACCSHIFT(x)                   ((x)->accShiftU)
+#define XAI_CNN_REDUCE_GET_OUTPUTSHIFT(x)                ((x)->outShiftU)
+#define XAI_CNN_REDUCE_GET_OUTPUTSCALE(x)                ((x)->outScale)
+#define XAI_CNN_REDUCE_GET_FLAG_RELU(x)                  ((x)->enableReLu)
+#define XAI_CNN_REDUCE_GET_RELU_MIN(x)                   ((x)->reluMin)
+#define XAI_CNN_REDUCE_GET_RELU_MAX(x)                   ((x)->reluMax)
+#define XAI_CNN_REDUCE_GET_TAKEABS(x)                    ((x)->take_abs)
+#define XAI_CNN_REDUCE_GET_REDUCED_ELEMENTS_COUNT(x)     ((x)->redEleCount)
+
+#define XAI_CNN_REDUCE_SET_CONFIG(x, v)                  ((x)->config = (v))
+#define XAI_CNN_REDUCE_SET_TILEFLAG(x, v)                ((x)->tileFlag = (v))
+#define XAI_CNN_REDUCE_SET_FIXUPINIT(x, v)               ((x)->fixUpInit = (v))
+#define XAI_CNN_REDUCE_SET_ACCSHIFT(x, v)                ((x)->accShiftU = (v))
+#define XAI_CNN_REDUCE_SET_OUTPUTSHIFT(x, v)             ((x)->outShiftU = (v))
+#define XAI_CNN_REDUCE_SET_OUTPUTSCALE(x, v)             ((x)->outScale = (v))
+#define XAI_CNN_REDUCE_SET_FLAG_RELU(x, v)               ((x)->enableReLu = (v))
+#define XAI_CNN_REDUCE_SET_RELU_MIN(x, v)                ((x)->reluMin = (v))
+#define XAI_CNN_REDUCE_SET_RELU_MAX(x, v)                ((x)->reluMax = (v))
+#define XAI_CNN_REDUCE_SET_TAKEABS(x, v)                 ((x)->take_abs = (v))
+#define XAI_CNN_REDUCE_SET_REDUCED_ELEMENTS_COUNT(x, v)  ((x)->redEleCount = (v))
+
+#define XAI_CNN_REDUCE_DIM1               (0x1)
+#define XAI_CNN_REDUCE_DIM2               (0x2)
+#define XAI_CNN_REDUCE_DIM3               (0x4)
+#define XAI_CNN_REDUCE_DIM4               (0x8)
+
+#define XAI_CNN_REDUCE_DIM12              (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM2)
+#define XAI_CNN_REDUCE_DIM13              (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM3)
+#define XAI_CNN_REDUCE_DIM14              (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM4)
+#define XAI_CNN_REDUCE_DIM23              (XAI_CNN_REDUCE_DIM2 | XAI_CNN_REDUCE_DIM3)
+#define XAI_CNN_REDUCE_DIM24              (XAI_CNN_REDUCE_DIM2 | XAI_CNN_REDUCE_DIM4)
+
+#define XAI_CNN_REDUCE_DIM34              (XAI_CNN_REDUCE_DIM3 | XAI_CNN_REDUCE_DIM4)
+#define XAI_CNN_REDUCE_DIM123             (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM2 | XAI_CNN_REDUCE_DIM3)
+#define XAI_CNN_REDUCE_DIM124             (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM2 | XAI_CNN_REDUCE_DIM4)
+#define XAI_CNN_REDUCE_DIM134             (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM3 | XAI_CNN_REDUCE_DIM4)
+
+#define XAI_CNN_REDUCE_DIM234             (XAI_CNN_REDUCE_DIM2 | XAI_CNN_REDUCE_DIM3 | XAI_CNN_REDUCE_DIM4)
+#define XAI_CNN_REDUCE_DIM1234            (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM2 | XAI_CNN_REDUCE_DIM3 | XAI_CNN_REDUCE_DIM4)
+
+#define XAI_CNN_REDUCE_INTERMEDIATE_TILE  0
+#define XAI_CNN_REDUCE_FIRST_TILE         1
+#define XAI_CNN_REDUCE_LAST_TILE          2
+#define XAI_CNN_REDUCE_FIRST_LAST_TILE    3
+
+/* Parameters for matrix multiplication */
+typedef struct
+{
+  uint8_t  accumShift;                   // Accumulator shift: converts the accumulator data to 16 bit
+  uint16_t outputScale;                  // Scale applied to the shifted data
+  uint8_t  outputShift;                  // Shift that converts the scaled data to 16 bit
+  int8_t   zeroPointIn1;                 // Zero point of the asymmetric input1 data
+  int8_t   zeroPointIn2;                 // Zero point of the asymmetric input2 data
+} xai_cnn_matmul_params;
+
+#define XAI_CNN_MATMUL_GET_ACCUM_SHIFT(prm)        ((prm)->accumShift)
+#define XAI_CNN_MATMUL_GET_OUTPUT_SCALE(prm)       ((prm)->outputScale)
+#define XAI_CNN_MATMUL_GET_OUTPUT_SHIFT(prm)       ((prm)->outputShift)
+#define XAI_CNN_MATMUL_GET_ZERO_POINT1(prm)        ((prm)->zeroPointIn1)
+#define XAI_CNN_MATMUL_GET_ZERO_POINT2(prm)        ((prm)->zeroPointIn2)
+#define XAI_CNN_MATMUL_SET_ACCUM_SHIFT(prm, val)   ((prm)->accumShift = (val))
+#define XAI_CNN_MATMUL_SET_OUTPUT_SCALE(prm, val)  ((prm)->outputScale = (val))
+#define XAI_CNN_MATMUL_SET_OUTPUT_SHIFT(prm, val)  ((prm)->outputShift = (val))
+#define XAI_CNN_MATMUL_SET_ZERO_POINT1(prm, val)   ((prm)->zeroPointIn1 = (val))
+#define XAI_CNN_MATMUL_SET_ZERO_POINT2(prm, val)   ((prm)->zeroPointIn2 = (val))
+
+/* Matrix Multiplication TFL Params; every accessor for this struct carries a _TFL suffix */
+typedef struct
+{
+  int32_t outputScale;
+  int32_t outputShift;
+  int32_t lhsTranspose;  // Can be 0 or 1
+  int32_t rhsTranspose;  // Can be 0 or 1
+  int32_t lhsOffset;
+  int32_t rhsOffset;
+  int32_t outOffset;
+  int32_t lhsBatch0;
+  int32_t lhsBatch1;
+  int32_t lhsBatch2;
+  int32_t rhsBatch0;
+  int32_t rhsBatch1;
+  int32_t rhsBatch2;
+  int32_t outBatch0;
+  int32_t outBatch1;
+  int32_t outBatch2;
+  int8_t  quantization_mode;  // NOTE(review): no accessor for this field is visible in this part of the list - confirm one follows
+} xai_cnn_tfl_matmul_params;
+
+#define XAI_CNN_MATMUL_GET_OUTPUT_SCALE_TFL(x)      ((x)->outputScale)
+#define XAI_CNN_MATMUL_SET_OUTPUT_SCALE_TFL(x, v)   ((x)->outputScale = (v))
+#define XAI_CNN_MATMUL_GET_OUTPUT_SHIFT_TFL(x)      ((x)->outputShift)
+#define XAI_CNN_MATMUL_SET_OUTPUT_SHIFT_TFL(x, v)   ((x)->outputShift = (v))
+#define XAI_CNN_MATMUL_GET_LHS_TRANSPOSE_TFL(x)     ((x)->lhsTranspose)
+#define XAI_CNN_MATMUL_SET_LHS_TRANSPOSE_TFL(x, v)  ((x)->lhsTranspose = (v))
+#define XAI_CNN_MATMUL_GET_RHS_TRANSPOSE_TFL(x)     ((x)->rhsTranspose)
+#define XAI_CNN_MATMUL_SET_RHS_TRANSPOSE_TFL(x, v)  ((x)->rhsTranspose = (v))
+#define XAI_CNN_MATMUL_GET_LHS_OFFSET_TFL(x)        ((x)->lhsOffset)
+#define XAI_CNN_MATMUL_SET_LHS_OFFSET_TFL(x, v)     ((x)->lhsOffset = (v))
+#define XAI_CNN_MATMUL_GET_RHS_OFFSET_TFL(x)        ((x)->rhsOffset)
+#define XAI_CNN_MATMUL_SET_RHS_OFFSET_TFL(x, v)     ((x)->rhsOffset = (v))
+#define XAI_CNN_MATMUL_GET_OUT_OFFSET_TFL(x)        ((x)->outOffset)
+#define XAI_CNN_MATMUL_SET_OUT_OFFSET_TFL(x, v)     ((x)->outOffset = (v))
+#define XAI_CNN_MATMUL_GET_LHS_BATCH0_TFL(x)        ((x)->lhsBatch0)
+#define XAI_CNN_MATMUL_SET_LHS_BATCH0_TFL(x, v)     ((x)->lhsBatch0 = (v))
+#define XAI_CNN_MATMUL_GET_LHS_BATCH1_TFL(x)        ((x)->lhsBatch1)
+#define XAI_CNN_MATMUL_SET_LHS_BATCH1_TFL(x, v)     ((x)->lhsBatch1 = (v))
+#define XAI_CNN_MATMUL_GET_LHS_BATCH2_TFL(x)        ((x)->lhsBatch2)
+#define XAI_CNN_MATMUL_SET_LHS_BATCH2_TFL(x, v)     ((x)->lhsBatch2 = (v))
+#define XAI_CNN_MATMUL_GET_RHS_BATCH0_TFL(x)        ((x)->rhsBatch0)
+#define XAI_CNN_MATMUL_SET_RHS_BATCH0_TFL(x, v)     ((x)->rhsBatch0 = (v))
+#define XAI_CNN_MATMUL_GET_RHS_BATCH1_TFL(x)        ((x)->rhsBatch1)
+#define XAI_CNN_MATMUL_SET_RHS_BATCH1_TFL(x, v)     ((x)->rhsBatch1 = (v))
+#define XAI_CNN_MATMUL_GET_RHS_BATCH2_TFL(x)        ((x)->rhsBatch2)
+#define XAI_CNN_MATMUL_SET_RHS_BATCH2_TFL(x, v)     ((x)->rhsBatch2 = (v))
+#define XAI_CNN_MATMUL_GET_OUT_BATCH0_TFL(x)        ((x)->outBatch0)
+#define XAI_CNN_MATMUL_SET_OUT_BATCH0_TFL(x, v)     ((x)->outBatch0 = (v))
+#define XAI_CNN_MATMUL_GET_OUT_BATCH1_TFL(x)        ((x)->outBatch1)
+#define XAI_CNN_MATMUL_SET_OUT_BATCH1_TFL(x, v)     ((x)->outBatch1 = (v))
+#define XAI_CNN_MATMUL_GET_OUT_BATCH2_TFL(x)        ((x)->outBatch2)
+#define XAI_CNN_MATMUL_SET_OUT_BATCH2_TFL(x, v)     ((x)->outBatch2 = (v))
+#define XAI_CNN_MATMUL_GET_QUANTIZATION_MODE(x)     ((x)->quantization_mode)
+#define XAI_CNN_MATMUL_SET_QUANTIZATION_MODE(x, v)  ((x)->quantization_mode = (v))
+
+/* Crop3DWithStride Params: three offs* and three stride* values, one per H, W and D suffix. The GET accessors below are pure expressions (no trailing ';'), so they can be used inside larger expressions; (x) is a pointer to the struct. */
+typedef struct
+{
+  int32_t offsH;
+  int32_t offsW;
+  int32_t offsD;
+  int32_t strideH;
+  int32_t strideW;
+  int32_t strideD;
+} xai_cnn_crop3DWithStride_params;
+
+#define XAI_CNN_CROP3DWITHSTRIDE_GET_OFFSD(x)       ((x)->offsD)
+#define XAI_CNN_CROP3DWITHSTRIDE_GET_OFFSW(x)       ((x)->offsW)
+#define XAI_CNN_CROP3DWITHSTRIDE_GET_OFFSH(x)       ((x)->offsH)
+#define XAI_CNN_CROP3DWITHSTRIDE_GET_STRIDED(x)     ((x)->strideD)
+#define XAI_CNN_CROP3DWITHSTRIDE_GET_STRIDEW(x)     ((x)->strideW)
+#define XAI_CNN_CROP3DWITHSTRIDE_GET_STRIDEH(x)     ((x)->strideH)
+#define XAI_CNN_CROP3DWITHSTRIDE_SET_OFFSD(x, v)    ((x)->offsD = (v))
+#define XAI_CNN_CROP3DWITHSTRIDE_SET_OFFSW(x, v)    ((x)->offsW = (v))
+#define XAI_CNN_CROP3DWITHSTRIDE_SET_OFFSH(x, v)    ((x)->offsH = (v))
+#define XAI_CNN_CROP3DWITHSTRIDE_SET_STRIDED(x, v)  ((x)->strideD = (v))
+#define XAI_CNN_CROP3DWITHSTRIDE_SET_STRIDEW(x, v)  ((x)->strideW = (v))
+#define XAI_CNN_CROP3DWITHSTRIDE_SET_STRIDEH(x, v)  ((x)->strideH = (v))
+
+typedef struct  // parameter record read and written through the XAI_CNN_QUANT_DEQUANT_GET/SET macros that follow
+{
+  float   scale;   // NOTE(review): presumably the quantization scale factor - confirm
+  int32_t offset;  // NOTE(review): presumably the zero point - confirm
+  int32_t axis;    // NOTE(review): presumably the tensor axis the parameters apply to - confirm
+} xai_cnn_quantDequantA_params;
+#define XAI_CNN_QUANT_DEQUANT_GET_SCALE(x)      ((x)->scale)
+#define XAI_CNN_QUANT_DEQUANT_SET_SCALE(x, v)   ((x)->scale = (v))
+#define XAI_CNN_QUANT_DEQUANT_GET_OFFSET(x)     ((x)->offset)
+#define XAI_CNN_QUANT_DEQUANT_SET_OFFSET(x, v)  ((x)->offset = (v))
+#define XAI_CNN_QUANT_DEQUANT_GET_AXIS(x)       ((x)->axis)
+#define XAI_CNN_QUANT_DEQUANT_SET_AXIS(x, v)    ((x)->axis = (v))
+
+typedef struct  // LSTM parameter record; each field has an XAI_CNN_LSTM_GET_... / XAI_CNN_LSTM_SET_... accessor defined after the struct
+{
+  xai_cnn_conv_params        fcInputParamIG;   // fcInputParam{IG,FG,OG,MI}: one xai_cnn_conv_params per suffix
+  xai_cnn_conv_params        fcInputParamFG;   // NOTE(review): IG/FG/OG presumably input/forget/output gate, MI the cell input - confirm
+  xai_cnn_conv_params        fcInputParamOG;
+  xai_cnn_conv_params        fcInputParamMI;
+  xai_cnn_conv_params        fcHiddenParamIG;  // fcHiddenParam{IG,FG,OG,MI}: same four suffixes as the fcInputParam fields
+  xai_cnn_conv_params        fcHiddenParamFG;
+  xai_cnn_conv_params        fcHiddenParamOG;
+  xai_cnn_conv_params        fcHiddenParamMI;
+
+  xai_cnn_sigmoid_params     sigmoidParamIG;   // sigmoid parameters for the IG/FG/OG suffixes
+  xai_cnn_sigmoid_params     sigmoidParamFG;
+  xai_cnn_sigmoid_params     sigmoidParamOG;
+  xai_cnn_tanh_params        tanhParamMI;      // tanh parameters for the MI suffix
+
+  xai_cnn_tfl_eltwise_params eltMulParamHS1;   // NOTE(review): HS presumably hidden state, CS cell state - confirm
+  xai_cnn_tfl_eltwise_params eltMulParamHS2;
+
+  xai_cnn_tanh_params        tanhParamCS;
+  xai_cnn_tfl_eltwise_params eltMulParamCS;
+
+  int16_t                    clipMin;          // 16-bit clip bounds; where they are applied is not visible in this header
+  int16_t                    clipMax;
+
+  int32_t                    timeMajorAxis;    // valid values are not visible in this header
+  int32_t                    direction;        // valid values are not visible in this header
+} xai_lstm_tfl_params;
+
+#define XAI_CNN_LSTM_GET_FC_INPUT_IG_PARAM(x)         ((x)->fcInputParamIG)
+#define XAI_CNN_LSTM_SET_FC_INPUT_IG_PARAM(x, v)      ((x)->fcInputParamIG = (v))
+#define XAI_CNN_LSTM_GET_FC_INPUT_FG_PARAM(x)         ((x)->fcInputParamFG)
+#define XAI_CNN_LSTM_SET_FC_INPUT_FG_PARAM(x, v)      ((x)->fcInputParamFG = (v))
+#define XAI_CNN_LSTM_GET_FC_INPUT_OG_PARAM(x)         ((x)->fcInputParamOG)
+#define XAI_CNN_LSTM_SET_FC_INPUT_OG_PARAM(x, v)      ((x)->fcInputParamOG = (v))
+#define XAI_CNN_LSTM_GET_FC_INPUT_MI_PARAM(x)         ((x)->fcInputParamMI)
+#define XAI_CNN_LSTM_SET_FC_INPUT_MI_PARAM(x, v)      ((x)->fcInputParamMI = (v))
+
+#define XAI_CNN_LSTM_GET_FC_HIDDEN_IG_PARAM(x)        ((x)->fcHiddenParamIG)
+#define XAI_CNN_LSTM_SET_FC_HIDDEN_IG_PARAM(x, v)     ((x)->fcHiddenParamIG = (v))
+#define XAI_CNN_LSTM_GET_FC_HIDDEN_FG_PARAM(x)        ((x)->fcHiddenParamFG)
+#define XAI_CNN_LSTM_SET_FC_HIDDEN_FG_PARAM(x, v)     ((x)->fcHiddenParamFG = (v))
+#define XAI_CNN_LSTM_GET_FC_HIDDEN_OG_PARAM(x)        ((x)->fcHiddenParamOG)
+#define XAI_CNN_LSTM_SET_FC_HIDDEN_OG_PARAM(x, v)     ((x)->fcHiddenParamOG = (v))
+#define XAI_CNN_LSTM_GET_FC_HIDDEN_MI_PARAM(x)        ((x)->fcHiddenParamMI)
+#define XAI_CNN_LSTM_SET_FC_HIDDEN_MI_PARAM(x, v)     ((x)->fcHiddenParamMI = (v))
+
+#define XAI_CNN_LSTM_GET_SIGMOID_IG_PARAM(x)          ((x)->sigmoidParamIG)
+#define XAI_CNN_LSTM_SET_SIGMOID_IG_PARAM(x, v)       ((x)->sigmoidParamIG = (v))
+#define XAI_CNN_LSTM_GET_SIGMOID_FG_PARAM(x)          ((x)->sigmoidParamFG)
+#define XAI_CNN_LSTM_SET_SIGMOID_FG_PARAM(x, v)       ((x)->sigmoidParamFG = (v))
+#define XAI_CNN_LSTM_GET_SIGMOID_OG_PARAM(x)          ((x)->sigmoidParamOG)
+#define XAI_CNN_LSTM_SET_SIGMOID_OG_PARAM(x, v)       ((x)->sigmoidParamOG = (v))
+#define XAI_CNN_LSTM_GET_TANH_MI_PARAM(x)             ((x)->tanhParamMI)
+#define XAI_CNN_LSTM_SET_TANH_MI_PARAM(x, v)          ((x)->tanhParamMI = (v))
+
+#define XAI_CNN_LSTM_GET_ELTWISE_MUL_HS1_PARAM(x)     ((x)->eltMulParamHS1)
+#define XAI_CNN_LSTM_SET_ELTWISE_MUL_HS1_PARAM(x, v)  ((x)->eltMulParamHS1 = (v))
+#define XAI_CNN_LSTM_GET_ELTWISE_MUL_HS2_PARAM(x)     ((x)->eltMulParamHS2)
+#define XAI_CNN_LSTM_SET_ELTWISE_MUL_HS2_PARAM(x, v)  ((x)->eltMulParamHS2 = (v))
+
+#define XAI_CNN_LSTM_GET_TANH_CS_PARAM(x)             ((x)->tanhParamCS)
+#define XAI_CNN_LSTM_SET_TANH_CS_PARAM(x, v)          ((x)->tanhParamCS = (v))
+#define XAI_CNN_LSTM_GET_ELTWISE_MUL_CS_PARAM(x)      ((x)->eltMulParamCS)
+#define XAI_CNN_LSTM_SET_ELTWISE_MUL_CS_PARAM(x, v)   ((x)->eltMulParamCS = (v))
+
+#define XAI_CNN_LSTM_GET_CLIP_MIN(x)                  ((x)->clipMin)
+#define XAI_CNN_LSTM_SET_CLIP_MIN(x, v)               ((x)->clipMin = (v))
+#define XAI_CNN_LSTM_GET_CLIP_MAX(x)                  ((x)->clipMax)
+#define XAI_CNN_LSTM_SET_CLIP_MAX(x, v)               ((x)->clipMax = (v))
+
+#define XAI_CNN_LSTM_GET_TIME_MAJOR_AXIS(x)           ((x)->timeMajorAxis)
+#define XAI_CNN_LSTM_SET_TIME_MAJOR_AXIS(x, v)        ((x)->timeMajorAxis = (v))
+#define XAI_CNN_LSTM_GET_DIRECTION(x)                 ((x)->direction)
+#define XAI_CNN_LSTM_SET_DIRECTION(x, v)              ((x)->direction = (v))
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+typedef struct
+{
+  xai_cnn_conv_params       fcInputParamIG;
+  xai_cnn_conv_params       fcInputParamFG;
+  xai_cnn_conv_params       fcInputParamOG;
+  xai_cnn_conv_params       fcInputParamMI;
+  xai_cnn_conv_params       fcHiddenParamIG;
+  xai_cnn_conv_params       fcHiddenParamFG;
+  xai_cnn_conv_params       fcHiddenParamOG;
+  xai_cnn_conv_params       fcHiddenParamMI;
+
+  xai_cnn_eltwiseMul_params eltMulParamHS1;
+  xai_cnn_eltwiseMul_params eltMulParamHS2;
+
+  xai_cnn_eltwiseMul_params eltMulParamCS;
+
+  xb_f16                    clipMinFP16;
+  xb_f16                    clipMaxFP16;
+
+  int32_t                   timeMajorAxis;
+  int32_t                   direction;
+} xai_lstm_F16_params;
+
+#define XAI_CNN_LSTM_F16_GET_FC_INPUT_IG_PARAM(x)         ((x)->fcInputParamIG)
+#define XAI_CNN_LSTM_F16_SET_FC_INPUT_IG_PARAM(x, v)      ((x)->fcInputParamIG = (v))
+#define XAI_CNN_LSTM_F16_GET_FC_INPUT_FG_PARAM(x)         ((x)->fcInputParamFG)
+#define XAI_CNN_LSTM_F16_SET_FC_INPUT_FG_PARAM(x, v)      ((x)->fcInputParamFG = (v))
+#define XAI_CNN_LSTM_F16_GET_FC_INPUT_OG_PARAM(x)         ((x)->fcInputParamOG)
+#define XAI_CNN_LSTM_F16_SET_FC_INPUT_OG_PARAM(x, v)      ((x)->fcInputParamOG = (v))
+#define XAI_CNN_LSTM_F16_GET_FC_INPUT_MI_PARAM(x)         ((x)->fcInputParamMI)
+#define XAI_CNN_LSTM_F16_SET_FC_INPUT_MI_PARAM(x, v)      ((x)->fcInputParamMI = (v))
+
+#define XAI_CNN_LSTM_F16_GET_FC_HIDDEN_IG_PARAM(x)        ((x)->fcHiddenParamIG)
+#define XAI_CNN_LSTM_F16_SET_FC_HIDDEN_IG_PARAM(x, v)     ((x)->fcHiddenParamIG = (v))
+#define XAI_CNN_LSTM_F16_GET_FC_HIDDEN_FG_PARAM(x)        ((x)->fcHiddenParamFG)
+#define XAI_CNN_LSTM_F16_SET_FC_HIDDEN_FG_PARAM(x, v)     ((x)->fcHiddenParamFG = (v))
+#define XAI_CNN_LSTM_F16_GET_FC_HIDDEN_OG_PARAM(x)        ((x)->fcHiddenParamOG)
+#define XAI_CNN_LSTM_F16_SET_FC_HIDDEN_OG_PARAM(x, v)     ((x)->fcHiddenParamOG = (v))
+#define XAI_CNN_LSTM_F16_GET_FC_HIDDEN_MI_PARAM(x)        ((x)->fcHiddenParamMI)
+#define XAI_CNN_LSTM_F16_SET_FC_HIDDEN_MI_PARAM(x, v)     ((x)->fcHiddenParamMI = (v))
+
+#define XAI_CNN_LSTM_F16_GET_ELTWISE_MUL_HS1_PARAM(x)     ((x)->eltMulParamHS1)
+#define XAI_CNN_LSTM_F16_SET_ELTWISE_MUL_HS1_PARAM(x, v)  ((x)->eltMulParamHS1 = (v))
+#define XAI_CNN_LSTM_F16_GET_ELTWISE_MUL_HS2_PARAM(x)     ((x)->eltMulParamHS2)
+#define XAI_CNN_LSTM_F16_SET_ELTWISE_MUL_HS2_PARAM(x, v)  ((x)->eltMulParamHS2 = (v))
+
+#define XAI_CNN_LSTM_F16_GET_ELTWISE_MUL_CS_PARAM(x)      ((x)->eltMulParamCS)
+#define XAI_CNN_LSTM_F16_SET_ELTWISE_MUL_CS_PARAM(x, v)   ((x)->eltMulParamCS = (v))
+
+#define XAI_CNN_LSTM_F16_GET_CLIP_MIN(x)                  ((x)->clipMinFP16)
+#define XAI_CNN_LSTM_F16_SET_CLIP_MIN(x, v)               ((x)->clipMinFP16 = (v))
+#define XAI_CNN_LSTM_F16_GET_CLIP_MAX(x)                  ((x)->clipMaxFP16)
+#define XAI_CNN_LSTM_F16_SET_CLIP_MAX(x, v)               ((x)->clipMaxFP16 = (v))
+
+#define XAI_CNN_LSTM_F16_GET_TIME_MAJOR_AXIS(x)           ((x)->timeMajorAxis)
+#define XAI_CNN_LSTM_F16_SET_TIME_MAJOR_AXIS(x, v)        ((x)->timeMajorAxis = (v))
+#define XAI_CNN_LSTM_F16_GET_DIRECTION(x)                 ((x)->direction)
+#define XAI_CNN_LSTM_F16_SET_DIRECTION(x, v)              ((x)->direction = (v))
+#endif
+
+typedef struct  // GRU parameter record; each field has an XAI_CNN_GRU_GET_... / XAI_CNN_GRU_SET_... accessor defined after the struct
+{
+  xai_cnn_conv_params        fcInputParamRG;   // fcInputParam{RG,UG,MS}: one xai_cnn_conv_params per suffix
+  xai_cnn_conv_params        fcInputParamUG;   // NOTE(review): RG/UG presumably reset/update gate, MS the candidate state - confirm
+  xai_cnn_conv_params        fcInputParamMS;
+  xai_cnn_conv_params        fcHiddenParamRG;  // fcHiddenParam{RG,UG,MS}: same three suffixes as the fcInputParam fields
+  xai_cnn_conv_params        fcHiddenParamUG;
+  xai_cnn_conv_params        fcHiddenParamMS;
+
+  xai_cnn_sigmoid_params     sigmoidParamRG;   // sigmoid parameters for the RG/UG suffixes
+  xai_cnn_sigmoid_params     sigmoidParamUG;
+  xai_cnn_tfl_eltwise_params eltMulParamMS;
+  xai_cnn_tanh_params        tanhParamMS;      // tanh parameters for the MS suffix
+
+  xai_cnn_tfl_eltwise_params eltMulParamHS1;   // NOTE(review): HS presumably hidden state - confirm
+  xai_cnn_tfl_eltwise_params eltMulParamHS2;
+
+  int32_t                    eltAddOutOffsetHS; // NOTE: eltAddOutOffsetHS is not used in S16 variant. For S16 variant, set it to 0.
+  int32_t                    timeMajorAxis;    // valid values are not visible in this header
+  int32_t                    direction;        // valid values are not visible in this header
+} xai_gru_tfl_params;
+
+#define XAI_CNN_GRU_GET_FC_INPUT_RG_PARAM(x)             ((x)->fcInputParamRG)
+#define XAI_CNN_GRU_SET_FC_INPUT_RG_PARAM(x, v)          ((x)->fcInputParamRG = (v))
+#define XAI_CNN_GRU_GET_FC_INPUT_UG_PARAM(x)             ((x)->fcInputParamUG)
+#define XAI_CNN_GRU_SET_FC_INPUT_UG_PARAM(x, v)          ((x)->fcInputParamUG = (v))
+#define XAI_CNN_GRU_GET_FC_INPUT_MS_PARAM(x)             ((x)->fcInputParamMS)
+#define XAI_CNN_GRU_SET_FC_INPUT_MS_PARAM(x, v)          ((x)->fcInputParamMS = (v))
+
+#define XAI_CNN_GRU_GET_FC_HIDDEN_RG_PARAM(x)            ((x)->fcHiddenParamRG)
+#define XAI_CNN_GRU_SET_FC_HIDDEN_RG_PARAM(x, v)         ((x)->fcHiddenParamRG = (v))
+#define XAI_CNN_GRU_GET_FC_HIDDEN_UG_PARAM(x)            ((x)->fcHiddenParamUG)
+#define XAI_CNN_GRU_SET_FC_HIDDEN_UG_PARAM(x, v)         ((x)->fcHiddenParamUG = (v))
+#define XAI_CNN_GRU_GET_FC_HIDDEN_MS_PARAM(x)            ((x)->fcHiddenParamMS)
+#define XAI_CNN_GRU_SET_FC_HIDDEN_MS_PARAM(x, v)         ((x)->fcHiddenParamMS = (v))
+
+#define XAI_CNN_GRU_GET_SIGMOID_RG_PARAM(x)              ((x)->sigmoidParamRG)
+#define XAI_CNN_GRU_SET_SIGMOID_RG_PARAM(x, v)           ((x)->sigmoidParamRG = (v))
+#define XAI_CNN_GRU_GET_SIGMOID_UG_PARAM(x)              ((x)->sigmoidParamUG)
+#define XAI_CNN_GRU_SET_SIGMOID_UG_PARAM(x, v)           ((x)->sigmoidParamUG = (v))
+#define XAI_CNN_GRU_GET_ELTWISE_MUL_MS_PARAM(x)          ((x)->eltMulParamMS)
+#define XAI_CNN_GRU_SET_ELTWISE_MUL_MS_PARAM(x, v)       ((x)->eltMulParamMS = (v))
+#define XAI_CNN_GRU_GET_TANH_MS_PARAM(x)                 ((x)->tanhParamMS)
+#define XAI_CNN_GRU_SET_TANH_MS_PARAM(x, v)              ((x)->tanhParamMS = (v))
+
+#define XAI_CNN_GRU_GET_ELTWISE_MUL_HS1_PARAM(x)         ((x)->eltMulParamHS1)
+#define XAI_CNN_GRU_SET_ELTWISE_MUL_HS1_PARAM(x, v)      ((x)->eltMulParamHS1 = (v))
+#define XAI_CNN_GRU_GET_ELTWISE_MUL_HS2_PARAM(x)         ((x)->eltMulParamHS2)
+#define XAI_CNN_GRU_SET_ELTWISE_MUL_HS2_PARAM(x, v)      ((x)->eltMulParamHS2 = (v))
+
+#define XAI_CNN_GRU_GET_ELTWISE_ADD_HS_OUT_OFFSET(x)     ((x)->eltAddOutOffsetHS)
+#define XAI_CNN_GRU_SET_ELTWISE_ADD_HS_OUT_OFFSET(x, v)  ((x)->eltAddOutOffsetHS = (v))
+#define XAI_CNN_GRU_GET_TIME_MAJOR_AXIS(x)               ((x)->timeMajorAxis)
+#define XAI_CNN_GRU_SET_TIME_MAJOR_AXIS(x, v)            ((x)->timeMajorAxis = (v))
+#define XAI_CNN_GRU_GET_DIRECTION(x)                     ((x)->direction)
+#define XAI_CNN_GRU_SET_DIRECTION(x, v)                  ((x)->direction = (v))
+#endif // #ifndef __XAI_CNN_API_PARAMS_H__
diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_common.h b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_common.h
new file mode 100644
index 00000000000..34b5aec2008
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_common.h
@@ -0,0 +1,4329 @@
+/*
+ * Copyright (c) 2021 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#ifndef __XAI_CNN_COMMON_H__
+#define __XAI_CNN_COMMON_H__
+
+#include "xai_tile_manager.h"
+#include "xai_core.h"
+#include "xai_cnn_api_common.h"
+#include "limits.h"
+
+// frequently used macros for rounding and clamping (function-like macros: arguments can be evaluated more than once, so pass side-effect-free expressions)
+#ifndef MAX2
+#define MAX2(a, b)  (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef MIN2
+#define MIN2(a, b)                        (((a) > (b)) ? (b) : (a))
+#endif
+#define CLAMP(v, min, max)                ((v) < (min) ? (min) : (v) > (max) ? (max) : (v))  /* limits v to the range [min, max] */
+#define ROUND(x, s)                       (((s) == 0) ? (x) : (((x) + (1 << ((s) - 1))) >> (s)))  /* adds 2^(s-1) (an int constant) before shifting right by s; s == 0 returns x unchanged */
+#define ROUND_N_CLAMP(x, s, min, max)     (((s) == 0) ? (CLAMP(x, min, max)) : (CLAMP(ROUND(x, s), min, max)))  /* ROUND followed by CLAMP */
+#define ROUND64B(x, s)                    (((s) == 0) ? (x) : \
+                                           (((x) + ((int64_t) 1 << ((s) - 1))) >> (s)))  /* as ROUND, but the rounding constant is formed in int64_t */
+#define ROUND_N_CLAMP64B(x, s, min, max)  (((s) == 0) ? (CLAMP(x, min, max)) : \
+                                           (CLAMP(ROUND64B(x, s), min, max)))  /* ROUND64B followed by CLAMP */
+#define ROI_CEIL(x, s)                    (((s) == 0) ? (x) : (((x) + (1 << ((s)))) >> (s)))  /* NOTE(review): adds a full 2^s (not 2^s - 1) before shifting, so exact multiples of 2^s also move up by one - confirm intended */
+
+#ifndef XCHAL_IVPN_SIMD_WIDTH
+#define XCHAL_IVPN_SIMD_WIDTH  32
+#endif
+
+/* Macros used for morphing various APIs: integer tags 1..22 whose names denote data-type selections; the value 15 is not assigned (14 is followed by 16). */
+#define SIGNED8BIT                  1
+#define UNSIGNED8BIT                2
+#define SIGNED16BIT                 3
+#define UNSIGNED16BIT               4
+#define SIGNED32BIT                 5
+#define INTEGER8BIT                 6
+#define INTEGER16BIT                7
+#define FLOAT16BIT                  8
+#define FLOAT32BIT                  9
+#define SIGNED8BITUNSIGNED8BIT      10
+#define UNSIGNED8BITSIGNED8BIT      11
+#define SIGNED8BITSIGNED16BIT       12
+#define UNSIGNED8BITSIGNED16BIT     13
+#define SIGNED16BITSIGNED16BIT      14
+#define UNSIGNED32BIT               16
+#define INPUT16BITFLOAT             17
+#define INPUT8BIT                   18
+#define INPUT16BIT                  19
+#define INPUT32BIT                  20
+#define SIGNED64BIT                 21
+#define UNSIGNED64BIT               22
+
+#define QP_DEPTH_U8                 ((uint8_t) UCHAR_MAX)   /* QP_DEPTH_x: the <limits.h> maximum of each integer width */
+#define QP_DEPTH_U16                ((uint16_t) USHRT_MAX)
+#define QP_DEPTH_S16                ((int16_t) SHRT_MAX)
+#define QP_DEPTH_S8                 ((uint8_t) SCHAR_MAX)   /* NOTE(review): cast is uint8_t although QP_DEPTH_S16 casts to the signed type - confirm intended */
+
+#define ADAPTIVE_AVG_POOL_Q_FORMAT  15
+
+#define CALC_NSA_32(input, count)                                           \
+    {                                                                       \
+      (count) = 0;                                                          \
+      int32_t mask  = 0x80000000;                                           \
+      int32_t index = 31;                                                   \
+      /* Sign bit of the input: 0 for non-negative, 1 for negative. */      \
+      int32_t sign = ((input) & mask) >> index & 0x00000001;                \
+      mask = 0x40000000;                                                    \
+      index--;                                                              \
+      /* Count the bits below the sign bit that equal it (leading zeros    \
+         for a positive input, leading ones for a negative one). mask is   \
+         tested first so the shift is never evaluated with index == -1. */ \
+      while ((mask != 0) && (sign == (((input) & mask) >> index)))          \
+      {                                                                     \
+        (count) += 1;                                                       \
+        mask   = mask >> 1;                                                 \
+        index--;                                                            \
+      }                                                                     \
+    }
+
+#define CONVERT_FP16_TO_FP32(F16Data)  (                       \
+    {                                                          \
+      int signBit, scaleSign, storedExponent;                  \
+      int trueExponent;                                        \
+      int significand, i;                                      \
+      float expVal, bitVal, temp, fractionFloat;               \
+      float implicitSignificand_val;                           \
+      int isSpecial = 0; /* 1 once floatVal holds the result */ \
+      trueExponent = 0;                                        \
+      implicitSignificand_val = 0;                             \
+      float floatVal = 0;                                      \
+      /* Input is cast to unsigned short and decoded as sign(1), exponent(5), significand(10) bits. */ \
+      unsigned short F16Data_U16 = (unsigned short) (F16Data); \
+      int hex_val_fp16 = (int) F16Data_U16;                    \
+                                                               \
+      signBit = (hex_val_fp16 >> 15);                          \
+      scaleSign = ((signBit == 0) ? (1) : (-1));               \
+      storedExponent = ((hex_val_fp16 & 0x7fff) >> 10);        \
+      significand = (hex_val_fp16 & 0x03ff);                   \
+      /* NOTE(review): the NaN sign choices below are kept exactly as upstream - confirm. */ \
+      if (storedExponent == 31) /* infinity or NaN */          \
+      {                                                        \
+        if (scaleSign == 1)                                    \
+        {                                                      \
+          if (significand == 0)                                \
+          {                                                    \
+            floatVal = +INFINITY;                              \
+            isSpecial = 1;                                     \
+          }                                                    \
+          else if (significand != 0)                           \
+          {                                                    \
+            floatVal = -NAN; /* +nan */                        \
+            isSpecial = 1;                                     \
+          }                                                    \
+        }                                                      \
+        else if (scaleSign == -1)                              \
+        {                                                      \
+          if (significand == 0)                                \
+          {                                                    \
+            floatVal = -INFINITY;                              \
+            isSpecial = 1;                                     \
+          }                                                    \
+          else if (significand != 0)                           \
+          {                                                    \
+            floatVal = NAN; /* -nan */                         \
+            isSpecial = 1;                                     \
+          }                                                    \
+        }                                                      \
+      }                                                        \
+      else if (storedExponent == 0) /* zero or subnormal */    \
+      {                                                        \
+        trueExponent = -14;                                    \
+        implicitSignificand_val = 0.0f;                        \
+                                                               \
+        if (scaleSign == 1)                                    \
+        {                                                      \
+          if (significand == 0)                                \
+          {                                                    \
+            floatVal = 0;                                      \
+            isSpecial = 1;                                     \
+          }                                                    \
+        }                                                      \
+        else if (scaleSign == -1)                              \
+        {                                                      \
+          if (significand == 0)                                \
+          {                                                    \
+            floatVal = -0.0f; /* float literal keeps the sign; integer -0 is +0 */ \
+            isSpecial = 1;                                     \
+          }                                                    \
+        }                                                      \
+      }                                                        \
+      else if ((storedExponent > 0) && (storedExponent < 31))  \
+      {                                                        \
+        trueExponent = storedExponent - 15;                    \
+        implicitSignificand_val = 1.0f;                        \
+      }                                                        \
+      /* Finite path: always computed (no side effects), used only when isSpecial is 0. */ \
+      expVal = powf(2, (float) trueExponent);                  \
+                                                               \
+      fractionFloat = 0.0f;                                    \
+      for (i = 10; i > 0; i--)                                 \
+      {                                                        \
+        bitVal = (float) (significand & 0x1);                  \
+        temp = bitVal / (1 << i);                              \
+        fractionFloat = fractionFloat + temp;                  \
+                                                               \
+        significand = significand >> 1;                        \
+      }                                                        \
+      fractionFloat = fractionFloat + implicitSignificand_val; \
+      /* The statement expression yields a float in every case; it never returns from the calling function. */ \
+      isSpecial ? floatVal : (scaleSign * expVal * fractionFloat); \
+    })
+
+#define XAI_CHECK_TILE3D_EDGE(tile, edge)                                                                    \
+  if (XAI_TILE3D_GET_DATA_ORDER(tile) == XAI_WHD) /* XAI_WHD: DIM1 and DIM2 edges are checked */             \
+  {                                                                                                          \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_EDGE1(tile) >= (edge) && XAI_TILE3D_GET_DIM1_EDGE2(tile) >= (edge) && \
+                    XAI_TILE3D_GET_DIM2_EDGE1(tile) >= (edge) && XAI_TILE3D_GET_DIM2_EDGE2(tile) >= (edge),  \
+                    XAI_ERR_EDGE, "The (" #tile ") tile must have at least " #edge "-pixel edge extension"); \
+  }                                                                                                          \
+  else if (XAI_TILE3D_GET_DATA_ORDER(tile) == XAI_DWH) /* XAI_DWH: DIM2 and DIM3 edges are checked */        \
+  {                                                                                                          \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_EDGE1(tile) >= (edge) && XAI_TILE3D_GET_DIM2_EDGE2(tile) >= (edge) && \
+                    XAI_TILE3D_GET_DIM3_EDGE1(tile) >= (edge) && XAI_TILE3D_GET_DIM3_EDGE2(tile) >= (edge),  \
+                    XAI_ERR_EDGE, "The (" #tile ") tile must have at least " #edge "-pixel edge extension"); \
+  } /* other data orders are not checked; expands to a bare if/else-if, so brace it when used under another if */
+
+#define XAI_CHECK_TILE3D_EDGE2(tile, edge1, edge2)                                                                       \
+  if (XAI_TILE3D_GET_DATA_ORDER(tile) == XAI_WHD) /* XAI_WHD: DIM1 edges vs edge1, DIM2 edges vs edge2 */                \
+  {                                                                                                                      \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_EDGE1(tile) >= (edge1) && XAI_TILE3D_GET_DIM1_EDGE2(tile) >= (edge1) &&          \
+                    XAI_TILE3D_GET_DIM2_EDGE1(tile) >= (edge2) && XAI_TILE3D_GET_DIM2_EDGE2(tile) >= (edge2),            \
+                    XAI_ERR_EDGE, "The (" #tile ") tile must have at least " #edge1 "/" #edge2 "-pixel edge extension"); \
+  }                                                                                                                      \
+  else if (XAI_TILE3D_GET_DATA_ORDER(tile) == XAI_DWH) /* XAI_DWH: DIM2 edges vs edge1, DIM3 edges vs edge2 */           \
+  {                                                                                                                      \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_EDGE1(tile) >= (edge1) && XAI_TILE3D_GET_DIM2_EDGE2(tile) >= (edge1) &&          \
+                    XAI_TILE3D_GET_DIM3_EDGE1(tile) >= (edge2) && XAI_TILE3D_GET_DIM3_EDGE2(tile) >= (edge2),            \
+                    XAI_ERR_EDGE, "The (" #tile ") tile must have at least " #edge1 "/" #edge2 "-pixel edge extension"); \
+  } /* other data orders are not checked; the message now separates the two edge values with '/' */
+
+#define XAI_CHECK_TILE3D_DATA_ORDER(tile, type) /* XAI_ERR_BADARG unless the 3D tile data order equals type */ \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(tile) == (type), XAI_ERR_BADARG, "The Data Order of (" #tile ") is not supported by this function")
+
+#define XAI_CHECK_TILE4D_DATA_ORDER(tile, type) /* XAI_ERR_BADARG unless the 4D tile data order equals type */ \
+  XAI_CHECK_ERROR(XAI_TILE4D_GET_DATA_ORDER(tile) == (type), XAI_ERR_BADARG, "The Data Order of (" #tile ") is not supported by this function")
+
+#define XAI_CHECK_KERNEL_SIZE(coeffT, size)                                                             \
+  if (XAI_TILE4D_GET_DATA_ORDER(coeffT) == XAI_WHDN) /* XAI_WHDN: DIM1 and DIM2 must both equal size */ \
+  {                                                                                                     \
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffT) == (size)) && (XAI_TILE4D_GET_DIM2(coeffT) == (size)), \
+                    XAI_ERR_KSIZE, "The Coefficient Kernel Size is not supported");                     \
+  }                                                                                                     \
+  else if (XAI_TILE4D_GET_DATA_ORDER(coeffT) == XAI_NDWH) /* XAI_NDWH: DIM3 and DIM4 must equal size */ \
+  {                                                                                                     \
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffT) == (size)) && (XAI_TILE4D_GET_DIM4(coeffT) == (size)), \
+                    XAI_ERR_KSIZE, "The Coefficient Kernel Size is not supported");                     \
+  } /* other data orders are not checked */
+
+#define XAI_CHECK_CONV_OUTPUT_TILE3D(outTile) /* runs XAI_CHECK_TILE3D, then requires type U8, S8 or S16 */ \
+  XAI_CHECK_TILE3D(outTile);                                                                           \
+  XAI_CHECK_ERROR((XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) \
+                  || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)),                                        \
+                  XAI_ERR_DATATYPE, "The argument (" #outTile ") has wrong type");
+
+#define XAI_CHECK_CONV_I16_OUTPUT_TILE3D(outTile) /* as above, but U16 is accepted as well */                \
+  XAI_CHECK_TILE3D(outTile);                                                                                 \
+  XAI_CHECK_ERROR((XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8))       \
+                  || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)), \
+                  XAI_ERR_DATATYPE, "The argument (" #outTile ") has wrong type");
+#define XAI_CHECK_CONV_OUTPUT_IX_TILE3D(outTile) /* expansion is identical to XAI_CHECK_CONV_I16_OUTPUT_TILE3D */ \
+  XAI_CHECK_TILE3D(outTile);                                                                                 \
+  XAI_CHECK_ERROR((XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8))       \
+                  || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)), \
+                  XAI_ERR_DATATYPE, "The argument (" #outTile ") has wrong type");
+
+#define XAI_CHECK_CONV_OUTPUT_TILE4D(outTile) /* 4D counterpart: XAI_CHECK_TILE4D, then type U8, S8 or S16; all four macros expand to two statements, so brace them under an if */ \
+  XAI_CHECK_TILE4D(outTile);                                                                           \
+  XAI_CHECK_ERROR((XAI_TILE4D_CHECK_TYPE(outTile, XAI_U8)) || (XAI_TILE4D_CHECK_TYPE(outTile, XAI_S8)) \
+                  || (XAI_TILE4D_CHECK_TYPE(outTile, XAI_S16)),                                        \
+                  XAI_ERR_DATATYPE, "The argument (" #outTile ") has wrong type");
+
+#define XAI_CHECK_STRIDE(param, stride) \
+  XAI_CHECK_ERROR(XAI_CNN_CONV_GET_STRIDE(param) == stride, XAI_ERR_BADARG, "The stride amount provided is not supported.");
+
+#define XAI_CHECK_DILATION(param, dilation) /* conv dilation read from param must equal (dilation), else XAI_ERR_BADARG is reported */ \
+  XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATION(param) == (dilation), XAI_ERR_BADARG, "The dilation value provided is not supported.");
+
+
+#define XAI_CHECK_POOLING_STRIDE(param, stride) /* pooling stride read from param must equal (stride), else XAI_ERR_BADARG is reported */ \
+  XAI_CHECK_ERROR(XAI_CNN_POOLING_GET_STRIDE(param) == (stride), XAI_ERR_BADARG, "The stride amount provided is not supported.");
+
+#define XAI_CHECK_CONSISTENCY_MOD_DWH(inT, coeffT, biasArr, outT, param) /* Declares dilatedKW_MOD and dilatedKH_MOD in caller scope. */       \
+  uint16_t dilatedKW_MOD = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1);                             \
+  uint16_t dilatedKH_MOD = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffT) - 1) + 1);                             \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE4D_GET_DIM2(coeffT), XAI_ERR_DATASIZE,                                                   \
+                  "Number of Input Channels not equal to the number of channels in the Kernel.");                                              \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE4D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                                  \
+                  "Number of Output Channels not equal to the number of Kernels.");                                                            \
+  if (dilatedKW_MOD % 2 != 0) /* NOTE: odd and even branches both reduce to ((inDim - 1) / stride) + 1. */                                     \
+  {                                                                                                                                            \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKW_MOD >> 1)                                           \
+                                                     + (dilatedKW_MOD >> 1) - dilatedKW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                       \
+  }                                                                                                                                            \
+  else                                                                                                                                         \
+  {                                                                                                                                            \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKW_MOD >> 1)                                           \
+                                                     + ((dilatedKW_MOD >> 1) - 1) - dilatedKW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                       \
+  }                                                                                                                                            \
+  if (dilatedKH_MOD % 2 != 0)                                                                                                                  \
+  {                                                                                                                                            \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedKH_MOD >> 1)                                           \
+                                                     + (dilatedKH_MOD >> 1) - dilatedKH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent..");                                                     \
+  }                                                                                                                                            \
+  else                                                                                                                                         \
+  {                                                                                                                                            \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedKH_MOD >> 1)                                           \
+                                                     + ((dilatedKH_MOD >> 1) - 1) - dilatedKH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent..");                                                     \
+  }                                                                                                                                            \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE4D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                               \
+                  "Width of Bias Array is less than number of Kernels.");                                                                      \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                                         \
+                  "Height of Bias Array should be greater than zero.");
+
+#define XAI_CHECK_CONSISTENCY_MOD_DWH_IN16DWH(inT, offsetArr, coeffT, biasArr, outT, param) /* Declares dilatedKW_MOD and dilatedKH_MOD in caller scope. */  \
+  uint16_t dilatedKW_MOD = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1);                                           \
+  uint16_t dilatedKH_MOD = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1);                                           \
+  XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_TILE3D_GET_DIM1(inT), 2 * XCHAL_IVPN_SIMD_WIDTH)) == (XAI_TILE4D_GET_DIM1(coeffT) >> 4), XAI_ERR_DATASIZE,              \
+                  "Number of Input Channels not equal to the number of channels in the Kernel.");                                                            \
+  XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_TILE3D_GET_DIM1(outT), 2 * XCHAL_IVPN_SIMD_WIDTH)) == (XAI_TILE4D_GET_DIM4(coeffT) << 4), XAI_ERR_DATASIZE,             \
+                  "Number of Output Channels not equal to the number of Kernels.");                                                                          \
+  if (dilatedKW_MOD % 2 != 0) /* NOTE: odd and even branches both reduce to ((inDim - 1) / stride) + 1. */                                                   \
+  {                                                                                                                                                          \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKW_MOD >> 1)                                                         \
+                                                     + (dilatedKW_MOD >> 1) - dilatedKW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),                     \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                     \
+  }                                                                                                                                                          \
+  else                                                                                                                                                       \
+  {                                                                                                                                                          \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKW_MOD >> 1)                                                         \
+                                                     + ((dilatedKW_MOD >> 1) - 1) - dilatedKW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),               \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                     \
+  }                                                                                                                                                          \
+  if (dilatedKH_MOD % 2 != 0)                                                                                                                                \
+  {                                                                                                                                                          \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedKH_MOD >> 1)                                                         \
+                                                     + (dilatedKH_MOD >> 1) - dilatedKH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                     \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent..");                                                                   \
+  }                                                                                                                                                          \
+  else                                                                                                                                                       \
+  {                                                                                                                                                          \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedKH_MOD >> 1)                                                         \
+                                                     + ((dilatedKH_MOD >> 1) - 1) - dilatedKH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),               \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent..");                                                                   \
+  }                                                                                                                                                          \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= (XAI_TILE3D_GET_DIM1(outT)), XAI_ERR_DATASIZE,                                                             \
+                  "Width of Bias Array is less than number of Kernels.");                                                                                    \
+  XAI_CHECK_ERROR((XAI_ARRAY_GET_WIDTH(offsetArr) >=                                                                                                         \
+                   (XAI_TILE4D_GET_DIM2(coeffT) * XAI_TILE4D_GET_DIM3(coeffT) * (XAI_ALIGN_VAL(XAI_TILE3D_GET_DIM1(inT), 2 * XCHAL_IVPN_SIMD_WIDTH) >> 4))), \
+                  XAI_ERR_DATASIZE, "Input offset Array size should be equal to kernelHeight * kernelWidth * (ALIGN(InputChannels,16)/16).");
+
+#define XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inT, coeffT, biasArr, outT, param) /* Declares dilatedKW_MOD and dilatedKH_MOD in caller scope. */   \
+  uint16_t dilatedKW_MOD = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1);                             \
+  uint16_t dilatedKH_MOD = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffT) - 1) + 1);                             \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) == XAI_TILE4D_GET_DIM2(coeffT), XAI_ERR_DATASIZE,                                                   \
+                  "Number of Input Channels not equal to the number of channels in the Kernel.");                                              \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE4D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                                  \
+                  "Number of Output Channels not equal to the number of Kernels.");                                                            \
+  if (dilatedKW_MOD % 2 != 0) /* NOTE: odd and even branches both reduce to ((inDim - 1) / stride) + 1. */                                     \
+  {                                                                                                                                            \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (dilatedKW_MOD >> 1)                                           \
+                                                     + (dilatedKW_MOD >> 1) - dilatedKW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                       \
+  }                                                                                                                                            \
+  else                                                                                                                                         \
+  {                                                                                                                                            \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (dilatedKW_MOD >> 1)                                           \
+                                                     + ((dilatedKW_MOD >> 1) - 1) - dilatedKW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                       \
+  }                                                                                                                                            \
+  if (dilatedKH_MOD % 2 != 0)                                                                                                                  \
+  {                                                                                                                                            \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKH_MOD >> 1)                                           \
+                                                     + (dilatedKH_MOD >> 1) - dilatedKH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent..");                                                     \
+  }                                                                                                                                            \
+  else                                                                                                                                         \
+  {                                                                                                                                            \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKH_MOD >> 1)                                           \
+                                                     + ((dilatedKH_MOD >> 1) - 1) - dilatedKH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent..");                                                     \
+  }                                                                                                                                            \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE4D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                               \
+                  "Width of Bias Array is less than number of Kernels.");                                                                      \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                                         \
+                  "Height of Bias Array should be greater than zero.");
+
+#define XAI_CHECK_CONSISTENCY_MOW_WHD(inT, coeffT, biasArr, outT, param) /* Declares dilatedKW_MOW and dilatedKH_MOW in caller scope. */      \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) == XAI_TILE4D_GET_DIM3(coeffT), XAI_ERR_DATASIZE,                                                  \
+                  "Number of Input Channels not equal to the number of channels in the Kernel.");                                             \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outT) == XAI_TILE4D_GET_DIM4(coeffT), XAI_ERR_DATASIZE,                                                 \
+                  "Number of Output Channels not equal to the number of Kernels.");                                                           \
+  uint16_t dilatedKW_MOW = (uint16_t) (XAI_CNN_CONV_GET_DILATION(param) * (XAI_TILE4D_GET_DIM1(coeffT) - 1) + 1);                             \
+  uint16_t dilatedKH_MOW = (uint16_t) (XAI_CNN_CONV_GET_DILATION(param) * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1);                             \
+  if (dilatedKW_MOW % 2 != 0) /* NOTE: odd and even branches both reduce to ((inDim - 1) / stride) + 1. */                                    \
+  {                                                                                                                                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (dilatedKW_MOW >> 1)                                          \
+                                                     + (dilatedKW_MOW >> 1) - dilatedKW_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                      \
+  }                                                                                                                                           \
+  else                                                                                                                                        \
+  {                                                                                                                                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (dilatedKW_MOW >> 1)                                          \
+                                                     + ((dilatedKW_MOW >> 1) - 1) - dilatedKW_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                      \
+  }                                                                                                                                           \
+  if (dilatedKH_MOW % 2 != 0)                                                                                                                 \
+  {                                                                                                                                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKH_MOW >> 1)                                          \
+                                                     + (dilatedKH_MOW >> 1) - dilatedKH_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent..");                                                    \
+  }                                                                                                                                           \
+  else                                                                                                                                        \
+  {                                                                                                                                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKH_MOW >> 1)                                          \
+                                                     + ((dilatedKH_MOW >> 1) - 1) - dilatedKH_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent..");                                                    \
+  }                                                                                                                                           \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM3(outT), XAI_ERR_DATASIZE,                                                \
+                  "Width of Bias Array is less than number of Kernels.");                                                                     \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                                        \
+                  "Height of Bias Array should be greater than zero.");
+
+/* outT is assumed to be ID16WH */
+/* inT is assumed to be DWH */
+/* coeffT is assumed to be RMOD_DWH_ID16WH; the SIMD-width-64 variant also handles RMOD_DWH_I16_ID16WH */
+#if (XCHAL_IVPN_SIMD_WIDTH == 64)
+#define XAI_CHECK_CONSISTENCY_MOD_DWH_ID16WH(inT, coeffT, biasArr, outT, param) /* Variant for XCHAL_IVPN_SIMD_WIDTH == 64. */        \
+  {                                                                                                                                   \
+    uint16_t dilationX     = XAI_CNN_CONV_GET_DILATIONX(param);                                                                       \
+    uint16_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);                                                                       \
+    int32_t dilatedkWidth  = dilationX * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1;                                                       \
+    int32_t dilatedkHeight = dilationY * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1;                                                       \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) << 4) == (XAI_TILE4D_GET_DIM4(coeffT) << 4), XAI_ERR_DATASIZE,                         \
+                    "Number of Output Channels not equal to the number of Kernels.");                                                 \
+    if (dilatedkWidth % 2 != 0) /* NOTE: odd and even branches both reduce to ((inDim - 1) / stride) + 1. */                          \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedkWidth >> 1) + (dilatedkWidth >> 1)  \
+                                                              - dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),            \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                            \
+    }                                                                                                                                 \
+    else                                                                                                                              \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= (((XAI_TILE3D_GET_DIM2(inT) +                                              \
+                                                              (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) -                     \
+                                                              dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),              \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                            \
+    }                                                                                                                                 \
+    if (dilatedkHeight % 2 != 0)                                                                                                      \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + (dilatedkHeight >> 1)       \
+                                                       - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                  \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                           \
+    }                                                                                                                                 \
+    else                                                                                                                              \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + ((dilatedkHeight >> 1) - 1) \
+                                                       - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                  \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                           \
+    }                                                                                                                                 \
+    if (XAI_TILE4D_GET_DATA_ORDER(coeffT) == XAI_RMOD_DWH_I16_ID16WH)                                                                 \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1(coeffT) >> 4) == 4), XAI_ERR_DATASIZE,                                                    \
+                      "Number of Input Channels in the kernel after zero padding (if any) should be 4.");                             \
+    }                                                                                                                                 \
+    else if (XAI_TILE4D_GET_DATA_ORDER(coeffT) == XAI_RMOD_DWH_ID16WH)                                                                \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1(coeffT) >> 5) == 4), XAI_ERR_DATASIZE,                                                    \
+                      "Number of Input Channels in the kernel after zero padding (if any) should be 4.");                             \
+    }                                                                                                                                 \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(inT) <= 4), XAI_ERR_DATASIZE,                                                                \
+                    "Number of Input Channels should be less than equal to 4.");                                                      \
+    XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_ARRAY_GET_WIDTH(biasArr), 16) == (XAI_TILE3D_GET_DIM2(outT) << 4)), XAI_ERR_DATASIZE,          \
+                    "Width of Bias Array is less than or equal to the number of output channels.");                                   \
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                              \
+                    "Height of Bias Array should be greater than zero.");                                                             \
+  }
+
+#else
+#define XAI_CHECK_CONSISTENCY_MOD_DWH_ID16WH(inT, coeffT, biasArr, outT, param) /* Variant for other SIMD widths (#else). */          \
+  {                                                                                                                                   \
+    uint16_t dilationX     = XAI_CNN_CONV_GET_DILATIONX(param);                                                                       \
+    uint16_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);                                                                       \
+    int32_t dilatedkWidth  = dilationX * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1;                                                       \
+    int32_t dilatedkHeight = dilationY * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1;                                                       \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) << 4) == (XAI_TILE4D_GET_DIM4(coeffT) << 4), XAI_ERR_DATASIZE,                         \
+                    "Number of Output Channels not equal to the number of Kernels.");                                                 \
+    if (dilatedkWidth % 2 != 0) /* NOTE: odd and even branches both reduce to ((inDim - 1) / stride) + 1. */                          \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedkWidth >> 1) + (dilatedkWidth >> 1)  \
+                                                              - dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),            \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                            \
+    }                                                                                                                                 \
+    else                                                                                                                              \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= (((XAI_TILE3D_GET_DIM2(inT) +                                              \
+                                                              (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) -                     \
+                                                              dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),              \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                            \
+    }                                                                                                                                 \
+    if (dilatedkHeight % 2 != 0)                                                                                                      \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + (dilatedkHeight >> 1)       \
+                                                       - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                  \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                           \
+    }                                                                                                                                 \
+    else                                                                                                                              \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + ((dilatedkHeight >> 1) - 1) \
+                                                       - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                  \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                           \
+    }                                                                                                                                 \
+    XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1(coeffT) >> 4) == 4), XAI_ERR_DATASIZE,                                                      \
+                    "Number of Input Channels in the kernel after zero padding (if any) should be 4.");                               \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(inT) <= 4), XAI_ERR_DATASIZE,                                                                \
+                    "Number of Input Channels should be less than equal to 4.");                                                      \
+    XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_ARRAY_GET_WIDTH(biasArr), 16) == (XAI_TILE3D_GET_DIM2(outT) << 4)), XAI_ERR_DATASIZE,          \
+                    "Width of Bias Array is less than or equal to the number of output channels.");                                   \
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                              \
+                    "Height of Bias Array should be greater than zero.");                                                             \
+  }
+#endif
+
+#define XAI_CHECK_CONSISTENCY_DEPTHWISE_DILATED_MOW_WHD(inT, coeffT, biasArr, outT, param) /* Checks inT, coeffT, biasArr, outT shapes agree. */          \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(param) ==                                                \
+                  XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE,                                                                                          \
+                  "Number of Input Channels not equal to the number of channels in the Kernel.");                                                         \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outT) == XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE,                                                             \
+                  "Number of Output Channels not equal to the number of Kernels.");                                                                       \
+  uint16_t dilatedKW_MOW = (uint16_t) (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param) *                                                               \
+                                       (XAI_TILE3D_GET_DIM1(coeffT) - 1) + 1); /* Dilated kernel extent along DIM1 (width checks below). */               \
+  uint16_t dilatedKH_MOW = (uint16_t) (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param) *                                                               \
+                                       (XAI_TILE3D_GET_DIM2(coeffT) - 1) + 1); /* Dilated kernel extent along DIM2 (height checks below). */              \
+  if (dilatedKW_MOW % 2 != 0) /* Odd extent: (k >> 1) is added twice before subtracting k. */                                                             \
+  {                                                                                                                                                       \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (dilatedKW_MOW >> 1)                                                      \
+                                                     + (dilatedKW_MOW >> 1) - dilatedKW_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                  \
+  }                                                                                                                                                       \
+  else /* Even extent: (k >> 1) and (k >> 1) - 1 are added before subtracting k. */                                                                       \
+  {                                                                                                                                                       \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (dilatedKW_MOW >> 1)                                                      \
+                                                     + ((dilatedKW_MOW >> 1) - 1) - dilatedKW_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param)))  \
+                                                   + 1)), XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                            \
+  }                                                                                                                                                       \
+  if (dilatedKH_MOW % 2 != 0)                                                                                                                             \
+  {                                                                                                                                                       \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKH_MOW >> 1)                                                      \
+                                                     + (dilatedKH_MOW >> 1) - dilatedKH_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                 \
+  }                                                                                                                                                       \
+  else                                                                                                                                                    \
+  {                                                                                                                                                       \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKH_MOW >> 1)                                                      \
+                                                     + ((dilatedKH_MOW >> 1) - 1) - dilatedKH_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param)))  \
+                                                   + 1)), XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                           \
+  }                                                                                                                                                       \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM3(outT), XAI_ERR_DATASIZE,                                                            \
+                  "Width of Bias Array is less than number of Kernels.");                                                                                 \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                                                    \
+                  "Height of Bias Array should be greater than zero.");
+
+#define XAI_CHECK_CONSISTENCY_SO_DWH(inT, coeffT, biasArr, outT, param) /* Checks inT, coeffT, biasArr, outT shapes agree. */                \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE4D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                                 \
+                  "Number of Input Channels not equal to the number of channels in the Kernel.");                                            \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE4D_GET_DIM4(coeffT), XAI_ERR_DATASIZE,                                                \
+                  "Number of Output Channels not equal to the number of Kernels.");                                                          \
+  uint16_t dilatedKW_SO = (uint16_t) (XAI_CNN_CONV_GET_DILATION(param) * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1); /* dilated extent, DIM2 */  \
+  uint16_t dilatedKH_SO = (uint16_t) (XAI_CNN_CONV_GET_DILATION(param) * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1); /* dilated extent, DIM3 */  \
+  if (dilatedKW_SO % 2 != 0) /* Odd extent: (k >> 1) is added twice before subtracting k. */                                                 \
+  {                                                                                                                                          \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKW_SO >> 1)                                          \
+                                                     + (dilatedKW_SO >> 1) - dilatedKW_SO) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                     \
+  }                                                                                                                                          \
+  else /* Even extent: (k >> 1) and (k >> 1) - 1 are added before subtracting k. */                                                          \
+  {                                                                                                                                          \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKW_SO >> 1)                                          \
+                                                     + ((dilatedKW_SO >> 1) - 1) - dilatedKW_SO) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                     \
+  }                                                                                                                                          \
+  if (dilatedKH_SO % 2 != 0)                                                                                                                 \
+  {                                                                                                                                          \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedKH_SO >> 1)                                          \
+                                                     + (dilatedKH_SO >> 1) - dilatedKH_SO) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                    \
+  }                                                                                                                                          \
+  else                                                                                                                                       \
+  {                                                                                                                                          \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedKH_SO >> 1)                                          \
+                                                     + ((dilatedKH_SO >> 1) - 1) - dilatedKH_SO) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                    \
+  }                                                                                                                                          \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE4D_GET_DIM4(coeffT), XAI_ERR_DATASIZE,                                             \
+                  "Width of Bias Array is less than number of Kernels.");                                                                    \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                                       \
+                  "Height of Bias Array should be greater than zero.");
+
+#define XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffT, param) /* param is not referenced */     \
+  XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1_PITCH(coeffT) == XAI_TILE4D_GET_DIM1(coeffT)) && \
+                  (XAI_TILE4D_GET_DIM2_PITCH(coeffT) == XAI_TILE4D_GET_DIM1(coeffT) *   \
+                   XAI_TILE4D_GET_DIM2(coeffT)), XAI_ERR_BADARG,                        \
+                  "CoeffTile is not contiguous."); /* requires DIM1 pitch == DIM1 and DIM2 pitch == DIM1 * DIM2 */
+
+#define XAI_CHECK_CONSISTENCY_POOL_WHD(inT, outT, param) /* Checks stride > 0, matching DIM3, input edges vs. kernel size, and in/out DIM1/DIM2 sizes. */                                       \
+  XAI_CHECK_ERROR((XAI_CNN_POOLING_GET_STRIDEX(param) > 0) && (XAI_CNN_POOLING_GET_STRIDEY(param) > 0),                                                                                         \
+                  XAI_ERR_BADARG, "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height must be greater than 0",                                                                     \
+                  XAI_CNN_POOLING_GET_STRIDEX(param), XAI_CNN_POOLING_GET_STRIDEY(param));                                                                                                      \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) == XAI_TILE3D_GET_DIM3(outT),                                                                                                                        \
+                  XAI_ERR_CHANNEL_INVALID, "Number of input and output channels don't match");                                                                                                  \
+  if (XAI_CNN_POOLING_GET_KERNELWIDTH(param) % 2 != 0) /* Odd kernel width: both DIM1 edges must be >= width / 2. */                                                                            \
+  {                                                                                                                                                                                             \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) &&                                                                                          \
+                     XAI_TILE3D_GET_DIM1_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)),                                                                                           \
+                    XAI_ERR_EDGE, "Invalid edge for odd kernel size");                                                                                                                          \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1)                                                                   \
+                                                     + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)),   \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                                                        \
+  }                                                                                                                                                                                             \
+  else /* Even kernel width: which DIM1 edge may be one smaller depends on the left edge flag. */                                                                                               \
+  {                                                                                                                                                                                             \
+    if (XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(param))                                                                                                                                               \
+    {                                                                                                                                                                                           \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) &&                                                                                        \
+                       XAI_TILE3D_GET_DIM1_EDGE2(inT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1)),                                                                                   \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag set");                                                                                               \
+    }                                                                                                                                                                                           \
+    else                                                                                                                                                                                        \
+    {                                                                                                                                                                                           \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1) &&                                                                                  \
+                       XAI_TILE3D_GET_DIM1_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)),                                                                                         \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag reset");                                                                                             \
+    }                                                                                                                                                                                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - 1)                                                             \
+                                                     + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)),   \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                                                        \
+  }                                                                                                                                                                                             \
+  if (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) % 2 != 0) /* Odd kernel height: both DIM2 edges must be >= height / 2. */                                                                         \
+  {                                                                                                                                                                                             \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) &&                                                                                         \
+                     XAI_TILE3D_GET_DIM2_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)),                                                                                          \
+                    XAI_ERR_EDGE, "Invalid edge for odd kernel size");                                                                                                                          \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1)                                                                  \
+                                                     + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                                                       \
+  }                                                                                                                                                                                             \
+  else /* Even kernel height: which DIM2 edge may be one smaller depends on the top edge flag. */                                                                                               \
+  {                                                                                                                                                                                             \
+    if (XAI_CNN_POOLING_GET_TOPEDGE_FLAG(param))                                                                                                                                                \
+    {                                                                                                                                                                                           \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) &&                                                                                       \
+                       XAI_TILE3D_GET_DIM2_EDGE2(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1)),                                                                                  \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag set");                                                                                                \
+    }                                                                                                                                                                                           \
+    else                                                                                                                                                                                        \
+    {                                                                                                                                                                                           \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1) &&                                                                                 \
+                       XAI_TILE3D_GET_DIM2_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)),                                                                                        \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag reset");                                                                                              \
+    }                                                                                                                                                                                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - 1)                                                            \
+                                                     + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                                                       \
+  }
+
+#define XAI_CHECK_CONSISTENCY_POOL_DWH(inT, outT, param)                                                                                                                                        \
+  XAI_CHECK_ERROR((XAI_CNN_POOLING_GET_STRIDEX(param) > 0) && (XAI_CNN_POOLING_GET_STRIDEY(param) > 0),                                                                                         \
+                  XAI_ERR_BADARG, "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height must be greater than 0",                                                                     \
+                  XAI_CNN_POOLING_GET_STRIDEX(param), XAI_CNN_POOLING_GET_STRIDEY(param));                                                                                                      \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE3D_GET_DIM1(outT),                                                                                                                        \
+                  XAI_ERR_CHANNEL_INVALID, "Number of input and output channels don't match");                                                                                                  \
+  if ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) % 2 != 0))                                                                                                                                        \
+  {                                                                                                                                                                                             \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) &&                                                                                          \
+                     XAI_TILE3D_GET_DIM2_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)),                                                                                           \
+                    XAI_ERR_EDGE, "Invalid edge for odd kernel size");                                                                                                                          \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1)                                                                   \
+                                                     + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)),   \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                                                        \
+  }                                                                                                                                                                                             \
+  else                                                                                                                                                                                          \
+  {                                                                                                                                                                                             \
+    if (XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(param))                                                                                                                                               \
+    {                                                                                                                                                                                           \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) &&                                                                                        \
+                       XAI_TILE3D_GET_DIM2_EDGE2(inT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1)),                                                                                   \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag set");                                                                                               \
+    }                                                                                                                                                                                           \
+    else                                                                                                                                                                                        \
+    {                                                                                                                                                                                           \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1) &&                                                                                  \
+                       XAI_TILE3D_GET_DIM2_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)),                                                                                         \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag reset");                                                                                             \
+    }                                                                                                                                                                                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - 1)                                                             \
+                                                     + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)),   \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                                                        \
+  }                                                                                                                                                                                             \
+  if ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) % 2 != 0))                                                                                                                                       \
+  {                                                                                                                                                                                             \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) &&                                                                                         \
+                     XAI_TILE3D_GET_DIM3_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)),                                                                                          \
+                    XAI_ERR_EDGE, "Invalid edge for odd kernel size");                                                                                                                          \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1)                                                                  \
+                                                     + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                                                       \
+  }                                                                                                                                                                                             \
+  else                                                                                                                                                                                          \
+  {                                                                                                                                                                                             \
+    if (XAI_CNN_POOLING_GET_TOPEDGE_FLAG(param))                                                                                                                                                \
+    {                                                                                                                                                                                           \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) &&                                                                                       \
+                       XAI_TILE3D_GET_DIM3_EDGE2(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1)),                                                                                  \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag set");                                                                                                \
+    }                                                                                                                                                                                           \
+    else                                                                                                                                                                                        \
+    {                                                                                                                                                                                           \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1) &&                                                                                 \
+                       XAI_TILE3D_GET_DIM3_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)),                                                                                        \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag reset");                                                                                              \
+    }                                                                                                                                                                                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - 1)                                                            \
+                                                     + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                                                       \
+  }
+
+#define XAI_CHECK_CONSISTENCY_POOL_ID32WH(inT, outT, param) /* ID32WH pooling tile checks: DIM2 match, DIM1/DIM3 edge sizes, output extents. Expands to bare statements, not do { } while (0). */    \
+  XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) << 5) == (XAI_TILE3D_GET_DIM2(outT) << 5),                                                                                                               \
+                  XAI_ERR_CHANNEL_INVALID, "Number of input and output channels don't match");                                                                                                       \
+  if ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) % 2 != 0))                                                                                                                                             \
+  { /* Odd kernel width: both DIM1 edges of inT (each >> 5) must be at least kernelWidth / 2. */                                                                                                     \
+    XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inT) >> 5) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) &&                                                                                        \
+                     (XAI_TILE3D_GET_DIM1_EDGE2(inT) >> 5) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)),                                                                                         \
+                    XAI_ERR_EDGE, "Invalid edge for odd kernel size");                                                                                                                               \
+    XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 5) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 5) + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1)                                                          \
+                                                            + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                                                             \
+  } /* NOTE(review): ">> 5" presumably converts an interleaved-by-32 DIM1 extent to a width - confirm against the tile layout. */                                                                    \
+  else                                                                                                                                                                                               \
+  { /* Even kernel width: the left-edge flag picks which DIM1 edge needs kernelWidth / 2; the other needs one less. */                                                                               \
+    if (XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(param))                                                                                                                                                    \
+    {                                                                                                                                                                                                \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inT) >> 5) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) &&                                                                                      \
+                       (XAI_TILE3D_GET_DIM1_EDGE2(inT) >> 5) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1)),                                                                                 \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag set");                                                                                                    \
+    }                                                                                                                                                                                                \
+    else                                                                                                                                                                                             \
+    {                                                                                                                                                                                                \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inT) >> 5) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1) &&                                                                                \
+                       (XAI_TILE3D_GET_DIM1_EDGE2(inT) >> 5) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)),                                                                                       \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag reset");                                                                                                  \
+    }                                                                                                                                                                                                \
+    XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 5) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 5) + ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - 1)                                                    \
+                                                            + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                                                             \
+  } /* For non-negative kernel widths both branches reduce to (outT DIM1 >> 5) <= (((inT DIM1 >> 5) - 1) / strideX) + 1. */                                                                          \
+  if ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) % 2 != 0))                                                                                                                                            \
+  { /* Odd kernel height: both DIM3 edges of inT (unshifted) must be at least kernelHeight / 2. */                                                                                                   \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) &&                                                                                              \
+                     XAI_TILE3D_GET_DIM3_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)),                                                                                               \
+                    XAI_ERR_EDGE, "Invalid edge for odd kernel size");                                                                                                                               \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1)                                                                       \
+                                                     + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)),      \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                                                            \
+  }                                                                                                                                                                                                  \
+  else                                                                                                                                                                                               \
+  { /* Even kernel height: the top-edge flag picks which DIM3 edge needs kernelHeight / 2; the other needs one less. */                                                                              \
+    if (XAI_CNN_POOLING_GET_TOPEDGE_FLAG(param))                                                                                                                                                     \
+    {                                                                                                                                                                                                \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) &&                                                                                            \
+                       XAI_TILE3D_GET_DIM3_EDGE2(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1)),                                                                                       \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag set");                                                                                                     \
+    }                                                                                                                                                                                                \
+    else                                                                                                                                                                                             \
+    {                                                                                                                                                                                                \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1) &&                                                                                      \
+                       XAI_TILE3D_GET_DIM3_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)),                                                                                             \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag reset");                                                                                                   \
+    }                                                                                                                                                                                                \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - 1)                                                                 \
+                                                     + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)),      \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                                                            \
+  } /* For non-negative kernel heights both branches reduce to outT DIM3 <= ((inT DIM3 - 1) / strideY) + 1. */
+#define XAI_CHECK_CONSISTENCY_POOL_ID16WH(inT, outT, param) /* ID16WH pooling tile checks: DIM2 match, DIM1/DIM3 edge sizes, output extents. Expands to bare statements, not do { } while (0). */    \
+  XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) << 4) == (XAI_TILE3D_GET_DIM2(outT) << 4),                                                                                                               \
+                  XAI_ERR_CHANNEL_INVALID, "Number of input and output channels don't match");                                                                                                       \
+  if ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) % 2 != 0))                                                                                                                                             \
+  { /* Odd kernel width: both DIM1 edges of inT (each >> 4) must be at least kernelWidth / 2. */                                                                                                     \
+    XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inT) >> 4) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) &&                                                                                        \
+                     (XAI_TILE3D_GET_DIM1_EDGE2(inT) >> 4) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)),                                                                                         \
+                    XAI_ERR_EDGE, "Invalid edge for odd kernel size");                                                                                                                               \
+    XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 4) + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1)                                                          \
+                                                            + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                                                             \
+  } /* NOTE(review): ">> 4" presumably converts an interleaved-by-16 DIM1 extent to a width - confirm against the tile layout. */                                                                    \
+  else                                                                                                                                                                                               \
+  { /* Even kernel width: the left-edge flag picks which DIM1 edge needs kernelWidth / 2; the other needs one less. */                                                                               \
+    if (XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(param))                                                                                                                                                    \
+    {                                                                                                                                                                                                \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inT) >> 4) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) &&                                                                                      \
+                       (XAI_TILE3D_GET_DIM1_EDGE2(inT) >> 4) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1)),                                                                                 \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag set");                                                                                                    \
+    }                                                                                                                                                                                                \
+    else                                                                                                                                                                                             \
+    {                                                                                                                                                                                                \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inT) >> 4) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1) &&                                                                                \
+                       (XAI_TILE3D_GET_DIM1_EDGE2(inT) >> 4) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)),                                                                                       \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag reset");                                                                                                  \
+    }                                                                                                                                                                                                \
+    XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 4) + ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - 1)                                                    \
+                                                            + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                                                             \
+  } /* For non-negative kernel widths both branches reduce to (outT DIM1 >> 4) <= (((inT DIM1 >> 4) - 1) / strideX) + 1. */                                                                          \
+  if ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) % 2 != 0))                                                                                                                                            \
+  { /* Odd kernel height: both DIM3 edges of inT (unshifted) must be at least kernelHeight / 2. */                                                                                                   \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) &&                                                                                              \
+                     XAI_TILE3D_GET_DIM3_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)),                                                                                               \
+                    XAI_ERR_EDGE, "Invalid edge for odd kernel size");                                                                                                                               \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1)                                                                       \
+                                                     + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)),      \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                                                            \
+  }                                                                                                                                                                                                  \
+  else                                                                                                                                                                                               \
+  { /* Even kernel height: the top-edge flag picks which DIM3 edge needs kernelHeight / 2; the other needs one less. */                                                                              \
+    if (XAI_CNN_POOLING_GET_TOPEDGE_FLAG(param))                                                                                                                                                     \
+    {                                                                                                                                                                                                \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) &&                                                                                            \
+                       XAI_TILE3D_GET_DIM3_EDGE2(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1)),                                                                                       \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag set");                                                                                                     \
+    }                                                                                                                                                                                                \
+    else                                                                                                                                                                                             \
+    {                                                                                                                                                                                                \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1) &&                                                                                      \
+                       XAI_TILE3D_GET_DIM3_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)),                                                                                             \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag reset");                                                                                                   \
+    }                                                                                                                                                                                                \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - 1)                                                                 \
+                                                     + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)),      \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                                                            \
+  } /* For non-negative kernel heights both branches reduce to outT DIM3 <= ((inT DIM3 - 1) / strideY) + 1. */
+#define XAI_CHECK_CONSISTENCY_UNPOOL_WHD(inT, outT, param) /* Unpooling (WHD) tile consistency checks. */        \
+  /* (Output width - 1) and (output height - 1) must each be divisible by the matching stride */                 \
+  XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1(outT) - 1) % XAI_CNN_POOLING_GET_STRIDEX(param)) == 0),                 \
+                  XAI_ERR_DATASIZE, "Number of output widths to be generated should be a multiple of strideX");  \
+  XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM2(outT) - 1) % XAI_CNN_POOLING_GET_STRIDEY(param)) == 0),                 \
+                  XAI_ERR_DATASIZE, "Number of output heights to be generated should be a multiple of strideY"); \
+  /* NOTE(review): the messages say "multiple of stride" but the test is on (size - 1) - confirm wording */      \
+  /* Depth (DIM3) must be the same for the input and output tiles */                                             \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) == XAI_TILE3D_GET_DIM3(outT),                                         \
+                  XAI_ERR_CHANNEL_INVALID, "Number of input and output channels don't match");                   \
+                                                                                                                 \
+  /* Input width (DIM1) must be at least ((output width - 1) / strideX) + 1 */                                   \
+  XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(inT) >= ((XAI_TILE3D_GET_DIM1(outT) - 1) /                                \
+                                                XAI_CNN_POOLING_GET_STRIDEX(param)) + 1), XAI_ERR_DATASIZE,      \
+                  "Insufficient input width to generate requested output width");                                \
+                                                                                                                 \
+  /* Input height (DIM2) must be at least ((output height - 1) / strideY) + 1 */                                 \
+  XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) >= ((XAI_TILE3D_GET_DIM2(outT) - 1) /                                \
+                                                XAI_CNN_POOLING_GET_STRIDEY(param)) + 1), XAI_ERR_DATASIZE,      \
+                  "Insufficient input height to generate requested output height");                              \
+  /* Edge checks below are applied to outT; the edges of inT are not checked by this macro */                    \
+  if (XAI_CNN_POOLING_GET_KERNELWIDTH(param) % 2 != 0)                                                           \
+  {                                                                                                              \
+    /* Odd kernel width: both DIM1 edges of outT must be at least kernelWidth / 2 */                             \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) &&          \
+                     XAI_TILE3D_GET_DIM1_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)),           \
+                    XAI_ERR_EDGE, "Invalid left/right edge for odd kernel width.");                              \
+  }                                                                                                              \
+  else                                                                                                           \
+  {                                                                                                              \
+    /* Even kernel width: left-edge flag picks which DIM1 edge needs kernelWidth / 2 (other: one less) */        \
+    if (XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(param))                                                                \
+    {                                                                                                            \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) &&        \
+                       XAI_TILE3D_GET_DIM1_EDGE2(outT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1)),   \
+                      XAI_ERR_EDGE, "Invalid left/right edge for even kernel width with leftedge flag set");     \
+    }                                                                                                            \
+    else                                                                                                         \
+    {                                                                                                            \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(outT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1) &&  \
+                       XAI_TILE3D_GET_DIM1_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)),         \
+                      XAI_ERR_EDGE, "Invalid left/right edge for even kernel width with leftedge flag reset");   \
+    }                                                                                                            \
+  }                                                                                                              \
+  if (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) % 2 != 0)                                                          \
+  {                                                                                                              \
+    /* Odd kernel height: both DIM2 edges of outT must be at least kernelHeight / 2 */                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) &&         \
+                     XAI_TILE3D_GET_DIM2_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)),          \
+                    XAI_ERR_EDGE, "Invalid Top/Bottom edge for odd kernel height.");                             \
+  }                                                                                                              \
+  else                                                                                                           \
+  {                                                                                                              \
+    /* Even kernel height: top-edge flag picks which DIM2 edge needs kernelHeight / 2 (other: one less) */       \
+    if (XAI_CNN_POOLING_GET_TOPEDGE_FLAG(param))                                                                 \
+    {                                                                                                            \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) &&       \
+                       XAI_TILE3D_GET_DIM2_EDGE2(outT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1)),  \
+                      XAI_ERR_EDGE, "Invalid top/bottom edge for even kernel height with topedge flag set");     \
+    }                                                                                                            \
+    else                                                                                                         \
+    {                                                                                                            \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(outT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1) && \
+                       XAI_TILE3D_GET_DIM2_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)),        \
+                      XAI_ERR_EDGE, "Invalid top/bottom edge for even kernel height with topedge flag reset");   \
+    }                                                                                                            \
+  } /* NOTE: expands to bare statements and if/else blocks, not a do { } while (0) wrapper */
+
+#define XAI_CHECK_CONSISTENCY_UNPOOL_DWH(inT, outT, param) /* unpooling checks; DIM1=D, DIM2=W, DIM3=H */        \
+  /* (output width - 1) and (output height - 1) must be exact multiples of strideX / strideY */                  \
+  XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM2(outT) - 1) % XAI_CNN_POOLING_GET_STRIDEX(param)) == 0),                 \
+                  XAI_ERR_DATASIZE, "Number of output widths to be generated should be a multiple of strideX");  \
+  XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM3(outT) - 1) % XAI_CNN_POOLING_GET_STRIDEY(param)) == 0),                 \
+                  XAI_ERR_DATASIZE, "Number of output heights to be generated should be a multiple of strideY"); \
+                                                                                                                 \
+  /* Depth (DIM1) must be identical for the input and output tiles */                                            \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE3D_GET_DIM1(outT),                                         \
+                  XAI_ERR_CHANNEL_INVALID, "Number of input and output channels don't match");                   \
+                                                                                                                 \
+  /* Input width (DIM2) must be at least (output width - 1) / strideX + 1 */                                     \
+  XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) >= ((XAI_TILE3D_GET_DIM2(outT) - 1) /                                \
+                                                XAI_CNN_POOLING_GET_STRIDEX(param)) + 1), XAI_ERR_DATASIZE,      \
+                  "Insufficient input width to generate requested output width");                                \
+                                                                                                                 \
+  /* Input height (DIM3) must be at least (output height - 1) / strideY + 1 */                                   \
+  XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(inT) >= ((XAI_TILE3D_GET_DIM3(outT) - 1) /                                \
+                                                XAI_CNN_POOLING_GET_STRIDEY(param)) + 1), XAI_ERR_DATASIZE,      \
+                  "Insufficient input height to generate requested output height");                              \
+                                                                                                                 \
+  if (XAI_CNN_POOLING_GET_KERNELWIDTH(param) % 2 != 0)                                                           \
+  {                                                                                                              \
+    /* Odd kernel width: both DIM2 edges of outT must be >= kernelWidth / 2 */                                   \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) &&          \
+                     XAI_TILE3D_GET_DIM2_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)),           \
+                    XAI_ERR_EDGE, "Invalid left/right edge for odd kernel width.");                              \
+  }                                                                                                              \
+  else                                                                                                           \
+  {                                                                                                              \
+    /* Even kernel width: leftedge set -> EDGE1 >= kW/2, EDGE2 >= kW/2 - 1; reset -> the two bounds swap */      \
+    if (XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(param))                                                                \
+    {                                                                                                            \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) &&        \
+                       XAI_TILE3D_GET_DIM2_EDGE2(outT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1)),   \
+                      XAI_ERR_EDGE, "Invalid left/right edge for even kernel width with leftedge flag set");     \
+    }                                                                                                            \
+    else                                                                                                         \
+    {                                                                                                            \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(outT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1) &&  \
+                       XAI_TILE3D_GET_DIM2_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)),         \
+                      XAI_ERR_EDGE, "Invalid left/right edge for even kernel width with leftedge flag reset");   \
+    }                                                                                                            \
+  }                                                                                                              \
+  if (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) % 2 != 0)                                                          \
+  {                                                                                                              \
+    /* Odd kernel height: both DIM3 edges of outT must be >= kernelHeight / 2 */                                 \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) &&         \
+                     XAI_TILE3D_GET_DIM3_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)),          \
+                    XAI_ERR_EDGE, "Invalid Top/Bottom edge for odd kernel height.");                             \
+  }                                                                                                              \
+  else                                                                                                           \
+  {                                                                                                              \
+    /* Even kernel height: topedge set -> EDGE1 >= kH/2, EDGE2 >= kH/2 - 1; reset -> the two bounds swap */      \
+    if (XAI_CNN_POOLING_GET_TOPEDGE_FLAG(param))                                                                 \
+    {                                                                                                            \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) &&       \
+                       XAI_TILE3D_GET_DIM3_EDGE2(outT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1)),  \
+                      XAI_ERR_EDGE, "Invalid top/bottom edge for even kernel height with topedge flag set");     \
+    }                                                                                                            \
+    else                                                                                                         \
+    {                                                                                                            \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(outT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1) && \
+                       XAI_TILE3D_GET_DIM3_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)),        \
+                      XAI_ERR_EDGE, "Invalid top/bottom edge for even kernel height with topedge flag reset");   \
+    }                                                                                                            \
+  }
+
+#define XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param) /* inTile edge checks; declares dilatedKW/dilatedKH with no enclosing braces */ \
+  uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATION(param) * (XAI_TILE4D_GET_DIM1(coeffTile) - 1) + 1); \
+  uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATION(param) * (XAI_TILE4D_GET_DIM2(coeffTile) - 1) + 1); \
+  if (dilatedKW % 2 != 0) /* odd dilated width: each DIM1 edge needs dilatedKW / 2 */                            \
+  {                                                                                                              \
+    if (dilatedKH % 2 != 0) /* odd dilated height: each DIM2 edge needs dilatedKH / 2 */                         \
+    {                                                                                                            \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                                       \
+                      && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                                    \
+                      && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                                    \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                                   \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                       \
+    }                                                                                                            \
+    else /* even dilated height: topedge set -> EDGE1 >= KH/2, EDGE2 >= KH/2 - 1; else swapped */                \
+    {                                                                                                            \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                                  \
+      {                                                                                                          \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                                     \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                         \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                     \
+      }                                                                                                          \
+      else                                                                                                       \
+      {                                                                                                          \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                                     \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1))                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                     \
+      }                                                                                                          \
+    }                                                                                                            \
+  }                                                                                                              \
+  else /* even dilated width: leftedge set -> EDGE1 >= KW/2, EDGE2 >= KW/2 - 1; else swapped */                  \
+  {                                                                                                              \
+    if (dilatedKH % 2 != 0) /* even width, odd height */                                                         \
+    {                                                                                                            \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                                 \
+      {                                                                                                          \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                                     \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1))                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                     \
+      }                                                                                                          \
+      else                                                                                                       \
+      {                                                                                                          \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1))                             \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                     \
+      }                                                                                                          \
+    }                                                                                                            \
+    else /* even width and even height: both flags apply */                                                      \
+    {                                                                                                            \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                                  \
+      {                                                                                                          \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                               \
+        {                                                                                                        \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) &&                               \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                         \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) &&                               \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                          \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                   \
+        }                                                                                                        \
+        else                                                                                                     \
+        {                                                                                                        \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                         \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) &&                               \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) &&                               \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                          \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                   \
+        }                                                                                                        \
+      }                                                                                                          \
+      else                                                                                                       \
+      {                                                                                                          \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                               \
+        {                                                                                                        \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) &&                               \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2 - 1) &&                           \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                         \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)),                                \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                   \
+        }                                                                                                        \
+        else                                                                                                     \
+        {                                                                                                        \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                         \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) &&                               \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                         \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)),                                \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                   \
+        }                                                                                                        \
+      }                                                                                                          \
+    }                                                                                                            \
+  }
+
+/* Consistency checks across inT, coeffT, biasArr, outT and param; outT is assumed to be ID4WH. Locals stay inside the macro's own braces. */
+#define XAI_CHECK_CONSISTENCY_MOD_ID4WH(inT, coeffT, biasArr, outT, param)                                                            \
+  {                                                                                                                                   \
+    uint16_t dilationX     = XAI_CNN_CONV_GET_DILATIONX(param);                                                                       \
+    uint16_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);                                                                       \
+    int32_t dilatedkWidth  = dilationX * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1; /* kernel width read from coeffT DIM2 */              \
+    int32_t dilatedkHeight = dilationY * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1; /* kernel height read from coeffT DIM3 */             \
+    XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM2(outT) << 2) + 15) & (~15)) == (XAI_TILE4D_GET_DIM1(coeffT) >> 2), XAI_ERR_DATASIZE,        \
+                    "Number of Output Channels not equal to the number of Kernels.");                                                 \
+    XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_ARRAY_GET_WIDTH(biasArr), 16) >= (XAI_TILE3D_GET_DIM2(outT) << 2)), XAI_ERR_DATASIZE,          \
+                    "Width of Bias Array is less than number of output channels.");                                                   \
+    if (dilatedkWidth % 2 != 0) /* odd dilated kernel width */                                                                        \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 2) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 2) +                                       \
+                                                              (dilatedkWidth >> 1) + (dilatedkWidth >> 1) -                           \
+                                                              dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),              \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                            \
+    }                                                                                                                                 \
+    else /* even dilated kernel width: second half-kernel term is one smaller */                                                      \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 2) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 2) +                                       \
+                                                              (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) -                     \
+                                                              dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),              \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                            \
+    }                                                                                                                                 \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) << 2) == (XAI_TILE4D_GET_DIM4(coeffT) << 2), XAI_ERR_DATASIZE,                          \
+                    "Number of Input Channels not equal to the number of channels in the Kernel.");                                   \
+    if (dilatedkHeight % 2 != 0) /* odd dilated kernel height */                                                                      \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + (dilatedkHeight >> 1)       \
+                                                       - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                  \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                           \
+    }                                                                                                                                 \
+    else /* even dilated kernel height: second half-kernel term is one smaller */                                                     \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + ((dilatedkHeight >> 1) - 1) \
+                                                       - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                  \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                           \
+    }                                                                                                                                 \
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                              \
+                    "Height of Bias Array should be greater than zero.");                                                             \
+  }
+
+/* Consistency checks across inT, coeffT, biasArr, outT and param; outT is assumed to be ID16WH. Locals stay inside the macro's own braces. */
+#define XAI_CHECK_CONSISTENCY_MOD_ID16WH(inT, coeffT, biasArr, outT, param)                                                           \
+  {                                                                                                                                   \
+    uint16_t dilationX     = XAI_CNN_CONV_GET_DILATIONX(param);                                                                       \
+    uint16_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);                                                                       \
+    int32_t dilatedkWidth  = dilationX * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1; /* kernel width read from coeffT DIM3 */              \
+    int32_t dilatedkHeight = dilationY * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1; /* kernel height read from coeffT DIM2 */             \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) << 4) == (XAI_TILE4D_GET_DIM4(coeffT) << 4), XAI_ERR_DATASIZE,                         \
+                    "Number of Output Channels not equal to the number of Kernels.");                                                 \
+    XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_ARRAY_GET_WIDTH(biasArr), 16) >= (XAI_TILE3D_GET_DIM2(outT) << 4)), XAI_ERR_DATASIZE,          \
+                    "Width of Bias Array is less than number of output channels.");                                                   \
+    if (dilatedkWidth % 2 != 0) /* odd dilated kernel width */                                                                        \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 4) +                                       \
+                                                              (dilatedkWidth >> 1) + (dilatedkWidth >> 1) -                           \
+                                                              dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),              \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                            \
+    }                                                                                                                                 \
+    else /* even dilated kernel width: second half-kernel term is one smaller */                                                      \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 4) +                                       \
+                                                              (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) -                     \
+                                                              dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),              \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                            \
+    }                                                                                                                                 \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) << 4) == (XAI_TILE4D_GET_DIM1(coeffT) >> 4), XAI_ERR_DATASIZE,                          \
+                    "Number of Input Channels not equal to the number of channels in the Kernel.");                                   \
+    if (dilatedkHeight % 2 != 0) /* odd dilated kernel height */                                                                      \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + (dilatedkHeight >> 1)       \
+                                                       - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                  \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                           \
+    }                                                                                                                                 \
+    else /* even dilated kernel height: second half-kernel term is one smaller */                                                     \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + ((dilatedkHeight >> 1) - 1) \
+                                                       - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                  \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                           \
+    }                                                                                                                                 \
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                              \
+                    "Height of Bias Array should be greater than zero.");                                                             \
+  }
+
+#define XAI_CHECK_CONSISTENCY_MOD_ID32WH(inT, coeffT, biasArr, outT, param)                                                           \
+  { /* Cross-check the dimensions of inT, coeffT, biasArr and outT; any mismatch is reported as XAI_ERR_DATASIZE. */                  \
+    uint16_t dilationX     = XAI_CNN_CONV_GET_DILATIONX(param);                                                                       \
+    uint16_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);                                                                       \
+    int32_t dilatedkWidth  = dilationX * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1; /* kernel width after dilation */                     \
+    int32_t dilatedkHeight = dilationY * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1; /* kernel height after dilation */                    \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) << 5) == (XAI_TILE4D_GET_DIM4(coeffT) << 5), XAI_ERR_DATASIZE,                         \
+                    "Number of Output Channels not equal to the number of Kernels.");                                                 \
+    XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_ARRAY_GET_WIDTH(biasArr), 32) >= (XAI_TILE3D_GET_DIM2(outT) << 5)), XAI_ERR_DATASIZE,          \
+                    "Width of Bias Array is less than number of output channels.");                                                   \
+    if (dilatedkWidth % 2 != 0) /* odd k: (k >> 1) + (k >> 1) == k - 1, so the bound is (in - 1) / stride + 1 */                      \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 5) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 5) +                                       \
+                                                              (dilatedkWidth >> 1) + (dilatedkWidth >> 1) -                           \
+                                                              dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),              \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                            \
+    }                                                                                                                                 \
+    else /* even k: (k >> 1) + ((k >> 1) - 1) == k - 1, i.e. the same bound */                                                        \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 5) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 5) +                                       \
+                                                              (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) -                     \
+                                                              dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),              \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                            \
+    }                                                                                                                                 \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) << 5) == (XAI_TILE4D_GET_DIM1(coeffT) >> 5), XAI_ERR_DATASIZE,                          \
+                    "Number of Input Channels not equal to the number of channels in the Kernel."); /* TODO confirm <<5 vs >>5 */     \
+    if (dilatedkHeight % 2 != 0) /* same odd/even split as for the width */                                                           \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + (dilatedkHeight >> 1)       \
+                                                       - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                  \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                           \
+    }                                                                                                                                 \
+    else                                                                                                                              \
+    {                                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + ((dilatedkHeight >> 1) - 1) \
+                                                       - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                  \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                           \
+    }                                                                                                                                 \
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                              \
+                    "Height of Bias Array should be greater than zero.");                                                             \
+  }
+// Assuming that "inTile" is in DWH format. Checks that inTile has enough edge data for the dilated kernel (XAI_ERR_EDGE otherwise).
+// Assuming that "coeffTile" is in RMOD_DWH_ID16WH format. NOTE: expands without braces, so its four locals land in the caller's scope.
+#define XAI_CHECK_EDGES_MOD_DWH_ID16WH(inTile, coeffTile, param)                               \
+  uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);                                      \
+  uint16_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);                                  \
+  int32_t dilatedkWidth  = dilationX * (XAI_TILE4D_GET_DIM2(coeffTile) - 1) + 1;               \
+  int32_t dilatedkHeight = dilationY * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1;               \
+  if (dilatedkWidth % 2 != 0) /* odd width: both DIM2 edges need dilatedkWidth / 2 */          \
+  {                                                                                            \
+    if (dilatedkHeight % 2 != 0) /* odd height: both DIM3 edges need dilatedkHeight / 2 */     \
+    {                                                                                          \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedkWidth / 2)                 \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedkWidth / 2)              \
+                      && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedkHeight / 2)             \
+                      && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedkHeight / 2),            \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");     \
+    }                                                                                          \
+    else /* even height: one DIM3 edge may be 1 smaller; TOPEDGE flag picks which */           \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) /* flag set: EDGE1 keeps the larger bound */   \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedkWidth / 2)               \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedkWidth / 2)            \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedkHeight / 2)           \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedkHeight / 2) - 1)),  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+      else /* flag clear: EDGE2 keeps the larger bound */                                      \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedkWidth / 2)               \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedkWidth / 2)            \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedkHeight / 2) - 1))   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedkHeight / 2),          \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+  else /* even width: one DIM2 edge may be 1 smaller; LEFTEDGE flag picks which */             \
+  {                                                                                            \
+    if (dilatedkHeight % 2 != 0)                                                               \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                               \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedkWidth / 2)               \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedkWidth / 2) - 1)      \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedkHeight / 2)           \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedkHeight / 2),          \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+      else                                                                                     \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedkWidth / 2) - 1))       \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedkWidth / 2)            \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedkHeight / 2)           \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedkHeight / 2),          \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+    }                                                                                          \
+    else /* even width and even height: both flags are consulted */                            \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                \
+      {                                                                                        \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                             \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedkWidth / 2) &&         \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedkWidth / 2) - 1) &&   \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedkHeight / 2) &&        \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedkHeight / 2) - 1)),   \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+        else                                                                                   \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedkWidth / 2) - 1) &&   \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedkWidth / 2) &&         \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedkHeight / 2) &&        \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedkHeight / 2) - 1)),   \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+      }                                                                                        \
+      else                                                                                     \
+      {                                                                                        \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                             \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedkWidth / 2) &&         \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedkWidth / 2) - 1) &&   \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedkHeight / 2) - 1) &&  \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedkHeight / 2)),         \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+        else                                                                                   \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedkWidth / 2) - 1) &&   \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedkWidth / 2) &&         \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedkHeight / 2) - 1) &&  \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedkHeight / 2)),         \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }
+#define XAI_CHECK_EDGES_DEPTHWISE_DILATED_MOW_WHD(inTile, coeffTile, param) /* edge-data check; declares dilatedKW/dilatedKH in caller scope */ \
+  uint16_t dilatedKW = (uint16_t) (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param) *        \
+                                   (XAI_TILE3D_GET_DIM1(coeffTile) - 1) + 1); /* dilated W */  \
+  uint16_t dilatedKH = (uint16_t) (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param) *        \
+                                   (XAI_TILE3D_GET_DIM2(coeffTile) - 1) + 1); /* dilated H */  \
+  if (dilatedKW % 2 != 0) /* odd width: both DIM1 edges need dilatedKW / 2 */                  \
+  {                                                                                            \
+    if (dilatedKH % 2 != 0) /* odd height: both DIM2 edges need dilatedKH / 2 */               \
+    {                                                                                          \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                     \
+                      && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                  \
+                      && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                  \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                 \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");     \
+    }                                                                                          \
+    else /* even height: one DIM2 edge may be 1 smaller; TOPEDGE set -> EDGE1 keeps larger */  \
+    {                                                                                          \
+      if (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_TOPEDGE(param))                              \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                   \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),       \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+      else                                                                                     \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                   \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1))        \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),               \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+  else /* even width: one DIM1 edge may be 1 smaller; LEFTEDGE set -> EDGE1 keeps larger */    \
+  {                                                                                            \
+    if (dilatedKH % 2 != 0)                                                                    \
+    {                                                                                          \
+      if (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_LEFTEDGE(param))                             \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                   \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1))        \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),               \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+      else                                                                                     \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1))           \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),               \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+    }                                                                                          \
+    else /* even width and even height: both flags are consulted */                            \
+    {                                                                                          \
+      if (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_TOPEDGE(param))                              \
+      {                                                                                        \
+        if (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_LEFTEDGE(param))                           \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) &&             \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&       \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) &&             \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),        \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+        else                                                                                   \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&       \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) &&             \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) &&             \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),        \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+      }                                                                                        \
+      else                                                                                     \
+      {                                                                                        \
+        if (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_LEFTEDGE(param))                           \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) &&             \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2 - 1) &&         \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&       \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)),              \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+        else                                                                                   \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&       \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) &&             \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&       \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)),              \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }
+
+#define XAI_CHECK_EDGES_MOD_WHD(inTile, coeffTile, param)                                                        \
+  uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \
+  uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffTile) - 1) + 1); \
+  if (dilatedKW % 2 != 0)                                                                                        \
+  {                                                                                                              \
+    if (dilatedKH % 2 != 0)                                                                                      \
+    {                                                                                                            \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                                       \
+                      && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                                    \
+                      && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                                    \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                                   \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                       \
+    }                                                                                                            \
+    else                                                                                                         \
+    {                                                                                                            \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                                  \
+      {                                                                                                          \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                                     \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                         \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                     \
+      }                                                                                                          \
+      else                                                                                                       \
+      {                                                                                                          \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                                     \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1))                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                     \
+      }                                                                                                          \
+    }                                                                                                            \
+  }                                                                                                              \
+  else                                                                                                           \
+  {                                                                                                              \
+    if (dilatedKH % 2 != 0)                                                                                      \
+    {                                                                                                            \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                                 \
+      {                                                                                                          \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                                     \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) - 1)                            \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                     \
+      }                                                                                                          \
+      else                                                                                                       \
+      {                                                                                                          \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1))                             \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                     \
+      }                                                                                                          \
+    }                                                                                                            \
+    else                                                                                                         \
+    {                                                                                                            \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                                  \
+      {                                                                                                          \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                               \
+        {                                                                                                        \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) &&                               \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                         \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) &&                               \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                          \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                   \
+        }                                                                                                        \
+        else                                                                                                     \
+        {                                                                                                        \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                         \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) &&                               \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) &&                               \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                          \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                   \
+        }                                                                                                        \
+      }                                                                                                          \
+      else                                                                                                       \
+      {                                                                                                          \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                               \
+        {                                                                                                        \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) &&                               \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                         \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                         \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)),                                \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                   \
+        }                                                                                                        \
+        else                                                                                                     \
+        {                                                                                                        \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                         \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) &&                               \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                         \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)),                                \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                   \
+        }                                                                                                        \
+      }                                                                                                          \
+    }                                                                                                            \
+  }
+
+
+#define XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param) /* Check inTile edges vs. dilated kernel size */        \
+  uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \
+  uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffTile) - 1) + 1); \
+  if (dilatedKW % 2 != 0) /* odd dilated width: both DIM2 edges need dilatedKW/2 */                               \
+  {                                                                                                               \
+    if (dilatedKH % 2 != 0) /* odd dilated height: both DIM3 edges need dilatedKH/2 */                            \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                        \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                     \
+                      && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                     \
+                      && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                    \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                        \
+    }                                                                                                             \
+    else /* even dilated height: TOPEDGE flag picks which DIM3 edge may be 1 smaller */                           \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                                   \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                          \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+      else                                                                                                        \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1))                           \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+    }                                                                                                             \
+  }                                                                                                               \
+  else /* even dilated width: LEFTEDGE flag picks which DIM2 edge may be 1 smaller */                             \
+  {                                                                                                               \
+    if (dilatedKH % 2 != 0) /* odd dilated height: both DIM3 edges need dilatedKH/2 */                            \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                                  \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) - 1)                             \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+      else                                                                                                        \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1))                              \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+    }                                                                                                             \
+    else /* both even: each flag applies to its own dimension */                                                  \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                                   \
+      {                                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                                \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                           \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+        else                                                                                                      \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                           \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+      }                                                                                                           \
+      else                                                                                                        \
+      {                                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                                \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)),                                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+        else                                                                                                      \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)),                                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+      }                                                                                                           \
+    }                                                                                                             \
+  }
+
+#define XAI_CHECK_EDGES_MOD_WHD_DWH(inTile, coeffTile, param) /* Check inTile edges vs. dilated kernel size */    \
+  uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \
+  uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffTile) - 1) + 1); \
+  if (dilatedKW % 2 != 0) /* odd dilated width: both DIM1 edges need dilatedKW/2 */                               \
+  {                                                                                                               \
+    if (dilatedKH % 2 != 0) /* odd dilated height: both DIM2 edges need dilatedKH/2 */                            \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                                        \
+                      && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                                     \
+                      && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                                     \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                                    \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                        \
+    }                                                                                                             \
+    else /* even dilated height: TOPEDGE flag picks which DIM2 edge may be 1 smaller */                           \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                                   \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                          \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+      else                                                                                                        \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1))                           \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+    }                                                                                                             \
+  }                                                                                                               \
+  else /* even dilated width: LEFTEDGE flag picks which DIM1 edge may be 1 smaller */                             \
+  {                                                                                                               \
+    if (dilatedKH % 2 != 0) /* odd dilated height: both DIM2 edges need dilatedKH/2 */                            \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                                  \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) - 1)                             \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+      else                                                                                                        \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1))                              \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+    }                                                                                                             \
+    else /* both even: each flag applies to its own dimension */                                                  \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                                   \
+      {                                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                                \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) &&                                \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                           \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+        else                                                                                                      \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) &&                                \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                           \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+      }                                                                                                           \
+      else                                                                                                        \
+      {                                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                                \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)),                                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+        else                                                                                                      \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)),                                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+      }                                                                                                           \
+    }                                                                                                             \
+  }
+
+#define XAI_CHECK_TILES3D_CHECK_EDGES_QUANT(inTile, outTile) /* Edge/pointer checks applied only when both tiles share one data pointer */ \
+  {                                                                                                                                        \
+    if (XAI_TILE3D_GET_DATA_PTR(outTile) == XAI_TILE3D_GET_DATA_PTR(inTile)) /* shared data pointer => enforce the bounds below */         \
+    {                                                                                                                                      \
+      XAI_CHECK_ERROR(((2 * (XAI_TILE3D_GET_DIM1_PITCH(inTile) - XAI_TILE3D_GET_DIM1(inTile))) >=                                          \
+                       (XAI_TILE3D_GET_DIM1_EDGE1(outTile) + XAI_TILE3D_GET_DIM1_EDGE2(outTile))),                                         \
+                      XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met along dimension 1");                      \
+      XAI_CHECK_ERROR(((2 * (XAI_TILE3D_GET_DIM2_PITCH(inTile) - (XAI_TILE3D_GET_DIM1_PITCH(inTile) * XAI_TILE3D_GET_DIM2(inTile)))) >=    \
+                       (XAI_TILE3D_GET_DIM1_PITCH(outTile) * (XAI_TILE3D_GET_DIM2_EDGE1(outTile) + XAI_TILE3D_GET_DIM2_EDGE2(outTile)))),  \
+                      XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met  along dimension 2");                     \
+      XAI_CHECK_ERROR(((2 * (XAI_TILE3D_GET_DIM3_EDGE1(inTile) + XAI_TILE3D_GET_DIM3_EDGE2(inTile))) >=                                    \
+                       (XAI_TILE3D_GET_DIM3_EDGE1(outTile) + XAI_TILE3D_GET_DIM3_EDGE2(outTile))),                                         \
+                      XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met  along dimension 3");                     \
+      XAI_CHECK_ERROR((((size_t) (XAI_TILE3D_GET_BUFF_PTR(outTile))) >= ((size_t) (XAI_TILE3D_GET_BUFF_PTR(inTile)))), XAI_ERR_BADARG,     \
+                      "Output tile buffer pointer should be greater than or equal to input tile buffer pointer");                          \
+    }                                                                                                                                      \
+  }
+#define XAI_CHECK_TILES4D_CHECK_EDGES_QUANT(inTile, outTile) /* Edge/pointer checks applied only when both tiles share one data pointer */ \
+  {                                                                                                                                        \
+    if (XAI_TILE4D_GET_DATA_PTR(inTile) == XAI_TILE4D_GET_DATA_PTR(outTile))                                                               \
+    { /* bounds: outTile edge span <= 2 * (inTile pitch - used extent) */                                                                  \
+      XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1_EDGE1(outTile) + XAI_TILE4D_GET_DIM1_EDGE2(outTile)) <=                                        \
+                       (2 * (XAI_TILE4D_GET_DIM1_PITCH(inTile) - XAI_TILE4D_GET_DIM1(inTile)))), XAI_ERR_BADARG,                           \
+                      "Output and Input tile edges constraints have not been met along dimension 1");                                      \
+      XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1_PITCH(outTile) * (XAI_TILE4D_GET_DIM2_EDGE1(outTile) + XAI_TILE4D_GET_DIM2_EDGE2(outTile))) <= \
+                       (2 * (XAI_TILE4D_GET_DIM2_PITCH(inTile) - (XAI_TILE4D_GET_DIM1_PITCH(inTile) * XAI_TILE4D_GET_DIM2(inTile))))),     \
+                      XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met  along dimension 2");                     \
+      XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM2_PITCH(outTile) * (XAI_TILE4D_GET_DIM3_EDGE1(outTile) + XAI_TILE4D_GET_DIM3_EDGE2(outTile))) <= \
+                       (2 * (XAI_TILE4D_GET_DIM3_PITCH(inTile) - (XAI_TILE4D_GET_DIM2_PITCH(inTile) * XAI_TILE4D_GET_DIM3(inTile))))),     \
+                      XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met  along dimension 3");                     \
+      XAI_CHECK_ERROR(((size_t) (XAI_TILE4D_GET_BUFF_PTR(inTile)) <= ((size_t) (XAI_TILE4D_GET_BUFF_PTR(outTile)))), XAI_ERR_BADARG,       \
+                      "Output tile buffer pointer should be greater than or equal to input tile buffer pointer");                          \
+    }                                                                                                                                      \
+  }
+#define XAI_CHECK_TILES3D_CHECK_EDGES_DEQUANT(inTile, outTile) /* Edge/pointer checks applied only when tiles share one data pointer */   \
+  {                                                                                                                                       \
+    if (XAI_TILE3D_GET_DATA_PTR(outTile) == XAI_TILE3D_GET_DATA_PTR(inTile)) /* shared data pointer => enforce the bounds below */        \
+    {                                                                                                                                     \
+      XAI_CHECK_ERROR(((2 * (XAI_TILE3D_GET_DIM1_PITCH(outTile) - XAI_TILE3D_GET_DIM1(outTile))) >=                                       \
+                       (XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile))),                                          \
+                      XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met along dimension 1");                     \
+      XAI_CHECK_ERROR(((2 * (XAI_TILE3D_GET_DIM2_PITCH(outTile) - (XAI_TILE3D_GET_DIM1_PITCH(outTile) * XAI_TILE3D_GET_DIM2(outTile)))) >= \
+                       (XAI_TILE3D_GET_DIM1_PITCH(inTile) * (XAI_TILE3D_GET_DIM2_EDGE1(inTile) + XAI_TILE3D_GET_DIM2_EDGE2(inTile)))),    \
+                      XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met  along dimension 2");                    \
+      XAI_CHECK_ERROR(((2 * (XAI_TILE3D_GET_DIM3_EDGE1(outTile) + XAI_TILE3D_GET_DIM3_EDGE2(outTile))) >=                                 \
+                       (XAI_TILE3D_GET_DIM3_EDGE1(inTile) + XAI_TILE3D_GET_DIM3_EDGE2(inTile))),                                          \
+                      XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met  along dimension 3");                    \
+      XAI_CHECK_ERROR((((size_t) (XAI_TILE3D_GET_BUFF_PTR(inTile))) >= ((size_t) (XAI_TILE3D_GET_BUFF_PTR(outTile)))), XAI_ERR_BADARG,    \
+                      "Input tile buffer pointer should be greater than or equal to output tile buffer pointer");                         \
+    }                                                                                                                                     \
+  }
+#define XAI_CHECK_TILES4D_CHECK_EDGES_DEQUANT(inTile, outTile) /* Constraint checks for tiles that share one data pointer. */             \
+  { /* Each condition below is passed to XAI_CHECK_ERROR with XAI_ERR_BADARG. */                                                          \
+    if (XAI_TILE4D_GET_DATA_PTR(inTile) == XAI_TILE4D_GET_DATA_PTR(outTile)) /* nothing is checked when the pointers differ */            \
+    { /* dims 1-3: input edge extent <= 2 * output pitch slack (pitch - used extent) */                                                   \
+      XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1_EDGE1(inTile) + XAI_TILE4D_GET_DIM1_EDGE2(inTile)) <=                                         \
+                       (2 * (XAI_TILE4D_GET_DIM1_PITCH(outTile) - XAI_TILE4D_GET_DIM1(outTile)))), XAI_ERR_BADARG,                        \
+                      "Output and Input tile edges constraints have not been met along dimension 1");                                     \
+      XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1_PITCH(inTile) * (XAI_TILE4D_GET_DIM2_EDGE1(inTile) + XAI_TILE4D_GET_DIM2_EDGE2(inTile))) <=   \
+                       (2 * (XAI_TILE4D_GET_DIM2_PITCH(outTile) - (XAI_TILE4D_GET_DIM1_PITCH(outTile) * XAI_TILE4D_GET_DIM2(outTile))))), \
+                      XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met  along dimension 2");                    \
+      XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM2_PITCH(inTile) * (XAI_TILE4D_GET_DIM3_EDGE1(inTile) + XAI_TILE4D_GET_DIM3_EDGE2(inTile))) <=   \
+                       (2 * (XAI_TILE4D_GET_DIM3_PITCH(outTile) - (XAI_TILE4D_GET_DIM2_PITCH(outTile) * XAI_TILE4D_GET_DIM3(outTile))))), \
+                      XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met  along dimension 3");                    \
+      XAI_CHECK_ERROR(((size_t) (XAI_TILE4D_GET_BUFF_PTR(outTile)) <= ((size_t) (XAI_TILE4D_GET_BUFF_PTR(inTile)))), XAI_ERR_BADARG,      \
+                      "Input tile buffer pointer should be greater than or equal to output tile buffer pointer");                         \
+    } /* NOTE(review): factor 2 presumably reflects in/out element sizes - TODO confirm */                                                \
+  }
+
+#define XAI_CHECK_EDGES_MOD_DWH_IN16DWH(inTile, coeffTile, param) /* inTile edges must cover dilated kernel */    \
+  uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM2(coeffTile) - 1) + 1); \
+  uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \
+  if (dilatedKW % 2 != 0) /* odd dilatedKW: both DIM2 edges >= dilatedKW/2 */                                     \
+  { /* NOTE: dilatedKW/dilatedKH are declared in the invoking scope */                                            \
+    if (dilatedKH % 2 != 0) /* odd dilatedKH: both DIM3 edges >= dilatedKH/2 */                                   \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                        \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                     \
+                      && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                     \
+                      && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                    \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                        \
+    }                                                                                                             \
+    else /* even dilatedKH: one DIM3 edge may be 1 smaller */                                                     \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) /* TOPEDGE set: relax DIM3_EDGE2 */                               \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                          \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+      else /* TOPEDGE clear: relax DIM3_EDGE1 */                                                                  \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1))                           \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+    }                                                                                                             \
+  }                                                                                                               \
+  else /* even dilatedKW: one DIM2 edge may be 1 smaller */                                                       \
+  {                                                                                                               \
+    if (dilatedKH % 2 != 0) /* odd dilatedKH: both DIM3 edges >= dilatedKH/2 */                                   \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* LEFTEDGE set: relax DIM2_EDGE2 */                             \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) - 1)                             \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+      else /* LEFTEDGE clear: relax DIM2_EDGE1 */                                                                 \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1))                              \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+    }                                                                                                             \
+    else /* both even: one DIM2 and one DIM3 edge relaxed */                                                      \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) /* TOPEDGE set: relax DIM3_EDGE2 */                               \
+      {                                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* LEFTEDGE set: relax DIM2_EDGE2 */                           \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                           \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+        else /* LEFTEDGE clear: relax DIM2_EDGE1 */                                                               \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                           \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+      }                                                                                                           \
+      else /* TOPEDGE clear: relax DIM3_EDGE1 */                                                                  \
+      {                                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* LEFTEDGE set: relax DIM2_EDGE2 */                           \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)),                                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+        else /* LEFTEDGE clear: relax DIM2_EDGE1 */                                                               \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)),                                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+      }                                                                                                           \
+    }                                                                                                             \
+  }
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+#define XAI_CHECK_EDGES_F16_MOD_DWH(inTile, coeffTile, param) /* inTile edges must cover dilated kernel */        \
+  uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \
+  uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffTile) - 1) + 1); \
+  if (dilatedKW % 2 != 0) /* odd dilatedKW: both DIM2 edges >= dilatedKW/2 */                                     \
+  { /* NOTE: dilatedKW/dilatedKH are declared in the invoking scope */                                            \
+    if (dilatedKH % 2 != 0) /* odd dilatedKH: both DIM3 edges >= dilatedKH/2 */                                   \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                        \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                     \
+                      && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                     \
+                      && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                    \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                        \
+    }                                                                                                             \
+    else /* even dilatedKH: one DIM3 edge may be 1 smaller */                                                     \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) /* TOPEDGE set: relax DIM3_EDGE2 */                               \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                          \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+      else /* TOPEDGE clear: relax DIM3_EDGE1 */                                                                  \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1))                           \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+    }                                                                                                             \
+  }                                                                                                               \
+  else /* even dilatedKW: one DIM2 edge may be 1 smaller */                                                       \
+  {                                                                                                               \
+    if (dilatedKH % 2 != 0) /* odd dilatedKH: both DIM3 edges >= dilatedKH/2 */                                   \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* LEFTEDGE set: relax DIM2_EDGE2 */                             \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) - 1)                             \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+      else /* LEFTEDGE clear: relax DIM2_EDGE1 */                                                                 \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1))                              \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+    }                                                                                                             \
+    else /* both even: one DIM2 and one DIM3 edge relaxed */                                                      \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) /* TOPEDGE set: relax DIM3_EDGE2 */                               \
+      {                                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* LEFTEDGE set: relax DIM2_EDGE2 */                           \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                           \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+        else /* LEFTEDGE clear: relax DIM2_EDGE1 */                                                               \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                           \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+      }                                                                                                           \
+      else /* TOPEDGE clear: relax DIM3_EDGE1 */                                                                  \
+      {                                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* LEFTEDGE set: relax DIM2_EDGE2 */                           \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)),                                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+        else /* LEFTEDGE clear: relax DIM2_EDGE1 */                                                               \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)),                                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+      }                                                                                                           \
+    }                                                                                                             \
+  }
+
+#define XAI_CHECK_EDGES_DEPTHWISE_F16_MOD_DWH(inTile, coeffTile, param)                        \
+  int32_t kW = XAI_TILE3D_GET_DIM2(coeffTile);                                                 \
+  int32_t kH = XAI_TILE3D_GET_DIM3(coeffTile);                                                 \
+  if (kW % 2 != 0)                                                                             \
+  {                                                                                            \
+    if (kH % 2 != 0)                                                                           \
+    {                                                                                          \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                            \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                         \
+                      && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                         \
+                      && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                        \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");     \
+    }                                                                                          \
+    else                                                                                       \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)),              \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+      else                                                                                     \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1))               \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+  else                                                                                         \
+  {                                                                                            \
+    if (kH % 2 != 0)                                                                           \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                               \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) - 1)                 \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+      else                                                                                     \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1))                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+    }                                                                                          \
+    else                                                                                       \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                \
+      {                                                                                        \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                             \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) &&                    \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)),               \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+        else                                                                                   \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) &&                    \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)),               \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+      }                                                                                        \
+      else                                                                                     \
+      {                                                                                        \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                             \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)),                     \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+        else                                                                                   \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)),                     \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }
+#define XAI_CHECK_CONSISTENCY_DEPTHWISE_F16_MOD_DWH(inT, coeffT, biasArr, outT, param) /* Size-consistency checks between inT, coeffT, biasArr and outT. */ \
+  int32_t KW_MOD = XAI_TILE3D_GET_DIM2(coeffT); /* kernel width; macro is not brace-wrapped, so this is declared in the invoking scope */ \
+  int32_t KH_MOD = XAI_TILE3D_GET_DIM3(coeffT); /* kernel height; same scoping caveat as KW_MOD */                                \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                     \
+                  "Number of Input Channels not equal to the number of channels in the Kernel.");                                \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                    \
+                  "Number of Output Channels not equal to the number of channels in the Kernel.");                               \
+  if (KW_MOD % 2 != 0) /* odd width: numerator is dim2(inT) + 2 * (KW_MOD >> 1) - KW_MOD */                                      \
+  {                                                                                                                              \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1)                                    \
+                                                     + (KW_MOD >> 1) - KW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                         \
+  }                                                                                                                              \
+  else /* even width: second term is (KW_MOD >> 1) - 1; for positive KW_MOD both branches reduce to dim2(inT) - 1 */             \
+  {                                                                                                                              \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1)                                    \
+                                                     + ((KW_MOD >> 1) - 1) - KW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                         \
+  }                                                                                                                              \
+  if (KH_MOD % 2 != 0) /* odd height: same bound as the width check, applied to dim3 with STRIDEY */                             \
+  {                                                                                                                              \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1)                                    \
+                                                     + (KH_MOD >> 1) - KH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                        \
+  }                                                                                                                              \
+  else /* even height: second term is (KH_MOD >> 1) - 1 */                                                                       \
+  {                                                                                                                              \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1)                                    \
+                                                     + ((KH_MOD >> 1) - 1) - KH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                        \
+  }                                                                                                                              \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, /* bias width vs. coeffT dim1 */ \
+                  "Width of Bias Array is less than number of channels in the Kernel.");                                         \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                           \
+                  "Height of Bias Array should be greater than zero.");
+#define XAI_CHECK_CONV_RELU_LIMITS_F16(param, outTile)  { /* Validates the RELU min/max limits stored in param; outTile is not referenced in the body. */                                   \
+    if (XAI_CNN_CONV_GET_FLAG_RELU(param)) /* the limits are only checked when the RELU flag is set */                                                                                      \
+    {                                                                                                                                                                                       \
+      XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN_FLT(param) <= XAI_CNN_CONV_GET_RELU_MAX_FLT(param)), XAI_ERR_BADARG, /* ordering check: min <= max */                                      \
+                      "\nMinimum Value of RELU = %f,\nMaximum Value of RELU = %f , Min Limit should not be greater than Max Limit",                                                         \
+                      CONVERT_FP16_TO_FP32(XAI_CNN_CONV_GET_RELU_MIN_FLT(param)), CONVERT_FP16_TO_FP32(XAI_CNN_CONV_GET_RELU_MAX_FLT(param)));                                              \
+      XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN_FLT(param) >= XAI_F16_MIN && /* range check: both limits within [XAI_F16_MIN, XAI_F16_MAX] */                                              \
+                       XAI_CNN_CONV_GET_RELU_MAX_FLT(param) <= XAI_F16_MAX), XAI_ERR_BADARG,                                                                                                \
+                      "\nMinimum Value of RELU = %f, value should be greater than or equal to XAI_F16_MIN \nMaximum Value of RELU = %f, value should be less than or equal to XAI_F16_MAX", \
+                      CONVERT_FP16_TO_FP32(XAI_CNN_CONV_GET_RELU_MIN_FLT(param)), CONVERT_FP16_TO_FP32(XAI_CNN_CONV_GET_RELU_MAX_FLT(param)));                                              \
+    }                                                                                                                                                                                       \
+}
+#endif //if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+
+#if (XCHAL_HAVE_VISION_SP_VFPU == 1 || XCHAL_HAVE_BBENEP_SP_VFPU == 1 || defined(XAI_REF_ONLY_COMPILATION))
+#define XAI_CHECK_EDGES_F32_MOD_DWH(inTile, coeffTile, param) /* Checks that inTile's dim2/dim3 edges are large enough for the dilated kernel. */ \
+  uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); /* declared in the invoking scope (macro is not brace-wrapped) */ \
+  uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffTile) - 1) + 1); /* same form, from DILATIONY and coeffTile dim4 */ \
+  if (dilatedKW % 2 != 0) /* odd dilated width: both dim2 edges need dilatedKW / 2 */                             \
+  {                                                                                                               \
+    if (dilatedKH % 2 != 0) /* odd dilated height: both dim3 edges need dilatedKH / 2 */                          \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) /* NOTE(review): condition has no outer parentheses here, unlike the even/even calls below - confirm XAI_CHECK_ERROR parenthesizes its argument */ \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                     \
+                      && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                     \
+                      && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                    \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                        \
+    }                                                                                                             \
+    else /* even dilated height: one dim3 edge may be one smaller; TOPEDGE flag picks which */                    \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) /* flag set: dim3 edge1 >= dilatedKH / 2, edge2 >= that minus 1 */ \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                          \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+      else /* flag clear: the dim3 requirements are swapped */                                                    \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1))                           \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+    }                                                                                                             \
+  }                                                                                                               \
+  else /* even dilated width: one dim2 edge may be one smaller; LEFTEDGE flag picks which */                      \
+  {                                                                                                               \
+    if (dilatedKH % 2 != 0) /* odd dilated height: both dim3 edges need dilatedKH / 2 */                          \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* flag set: dim2 edge1 >= dilatedKW / 2, edge2 >= that minus 1 */ \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2)                                      \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) - 1)                             \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+      else /* flag clear: the dim2 requirements are swapped */                                                    \
+      {                                                                                                           \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1))                              \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2)                                   \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                  \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                      \
+      }                                                                                                           \
+    }                                                                                                             \
+    else /* even width and even height: TOPEDGE and LEFTEDGE each pick the larger side independently */           \
+    {                                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                                   \
+      {                                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                                \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                           \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+        else                                                                                                      \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)),                           \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+      }                                                                                                           \
+      else                                                                                                        \
+      {                                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                                \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)),                                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+        else                                                                                                      \
+        {                                                                                                         \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) &&                                \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) &&                          \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)),                                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                    \
+        }                                                                                                         \
+      }                                                                                                           \
+    }                                                                                                             \
+  }
+#define XAI_CHECK_EDGES_DEPTHWISE_F32_MOD_DWH(inTile, coeffTile, param) /* Checks that inTile's dim2/dim3 edges are large enough for the kernel in coeffTile. */ \
+  int32_t kW = XAI_TILE3D_GET_DIM2(coeffTile); /* declared in the invoking scope (macro is not brace-wrapped) */ \
+  int32_t kH = XAI_TILE3D_GET_DIM3(coeffTile); /* no dilation term is applied in this macro */  \
+  if (kW % 2 != 0) /* odd width: both dim2 edges need kW / 2 */                                \
+  {                                                                                            \
+    if (kH % 2 != 0) /* odd height: both dim3 edges need kH / 2 */                             \
+    {                                                                                          \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) /* NOTE(review): condition has no outer parentheses here, unlike the even/even calls below - confirm XAI_CHECK_ERROR parenthesizes its argument */ \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                         \
+                      && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                         \
+                      && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                        \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");     \
+    }                                                                                          \
+    else /* even height: one dim3 edge may be one smaller; TOPEDGE flag picks which */         \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) /* flag set: dim3 edge1 >= kH / 2, edge2 >= that minus 1 */ \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)),              \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+      else /* flag clear: the dim3 requirements are swapped */                                 \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1))               \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+  else /* even width: one dim2 edge may be one smaller; LEFTEDGE flag picks which */           \
+  {                                                                                            \
+    if (kH % 2 != 0) /* odd height: both dim3 edges need kH / 2 */                             \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* flag set: dim2 edge1 >= kW / 2, edge2 >= that minus 1 */ \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) - 1)                 \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+      else /* flag clear: the dim2 requirements are swapped */                                 \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1))                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+    }                                                                                          \
+    else /* even width and even height: TOPEDGE and LEFTEDGE each pick the larger side independently */ \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                \
+      {                                                                                        \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                             \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) &&                    \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)),               \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+        else                                                                                   \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) &&                    \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)),               \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+      }                                                                                        \
+      else                                                                                     \
+      {                                                                                        \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                             \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)),                     \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+        else                                                                                   \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)),                     \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }
+#define XAI_CHECK_CONSISTENCY_DEPTHWISE_F32_MOD_DWH(inT, coeffT, biasArr, outT, param) /* Size-consistency checks between inT, coeffT, biasArr and outT. */ \
+  int32_t KW_MOD = XAI_TILE3D_GET_DIM2(coeffT); /* kernel width; macro is not brace-wrapped, so this is declared in the invoking scope */ \
+  int32_t KH_MOD = XAI_TILE3D_GET_DIM3(coeffT); /* kernel height; same scoping caveat as KW_MOD */                                \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                     \
+                  "Number of Input Channels not equal to the number of channels in the Kernel.");                                \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                    \
+                  "Number of Output Channels not equal to the number of channels in the Kernel.");                               \
+  if (KW_MOD % 2 != 0) /* odd width: numerator is dim2(inT) + 2 * (KW_MOD >> 1) - KW_MOD */                                      \
+  {                                                                                                                              \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1)                                    \
+                                                     + (KW_MOD >> 1) - KW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                         \
+  }                                                                                                                              \
+  else /* even width: second term is (KW_MOD >> 1) - 1; for positive KW_MOD both branches reduce to dim2(inT) - 1 */             \
+  {                                                                                                                              \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1)                                    \
+                                                     + ((KW_MOD >> 1) - 1) - KW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                         \
+  }                                                                                                                              \
+  if (KH_MOD % 2 != 0) /* odd height: same bound as the width check, applied to dim3 with STRIDEY */                             \
+  {                                                                                                                              \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1)                                    \
+                                                     + (KH_MOD >> 1) - KH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                        \
+  }                                                                                                                              \
+  else /* even height: second term is (KH_MOD >> 1) - 1 */                                                                       \
+  {                                                                                                                              \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1)                                    \
+                                                     + ((KH_MOD >> 1) - 1) - KH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                        \
+  }                                                                                                                              \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, /* bias width vs. coeffT dim1 */ \
+                  "Width of Bias Array is less than number of channels in the Kernel.");                                         \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                           \
+                  "Height of Bias Array should be greater than zero.");
+#define XAI_CHECK_CONV_RELU_LIMITS_F32(param, outTile)  {  /* Validates the RELU min/max (FLT32 accessors) of param when its RELU flag is set; outTile is unused. */                                \
+    if (XAI_CNN_CONV_GET_FLAG_RELU(param))  /* nothing is checked when the RELU flag is clear */                                                                                                    \
+    {                                                                                                                                                                                               \
+      XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN_FLT32(param) <= XAI_CNN_CONV_GET_RELU_MAX_FLT32(param)), XAI_ERR_BADARG,  /* check 1: min must not exceed max */                                   \
+                      "\nMinimum Value of RELU = %f,\nMaximum Value of RELU = %f , Min Limit should not be greater than Max Limit",                                                                 \
+                      XAI_CNN_CONV_GET_RELU_MIN_FLT32(param), XAI_CNN_CONV_GET_RELU_MAX_FLT32(param));                                                                                              \
+      XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN_FLT32(param) >= XAI_F32_MIN_FLT &&  /* check 2: limits must stay inside the XAI_F32_MIN_FLT..XAI_F32_MAX_FLT range */                              \
+                       XAI_CNN_CONV_GET_RELU_MAX_FLT32(param) <= XAI_F32_MAX_FLT), XAI_ERR_BADARG,                                                                                                  \
+                      "\nMinimum Value of RELU = %f, value should be greater than or equal to XAI_F32_MIN_FLT \nMaximum Value of RELU = %f, value should be less than or equal to XAI_F32_MAX_FLT", \
+                      XAI_CNN_CONV_GET_RELU_MIN_FLT32(param), XAI_CNN_CONV_GET_RELU_MAX_FLT32(param));                                                                                              \
+    }                                                                                                                                                                                               \
+}
+#endif //if (XCHAL_HAVE_VISION_SP_VFPU == 1 || XCHAL_HAVE_BBENEP_SP_VFPU == 1 || defined(XAI_REF_ONLY_COMPILATION))
+
+#define XAI_CHECK_EDGES_SO(inTile, coeffTile, param) /* Validates inTile's DIM2/DIM3 edges against the kernel. */ \
+  uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM2(coeffTile) - 1) + 1); \
+  uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \
+  if (dilatedKW % 2 != 0) /* odd dilated width: both DIM2 edges need >= dilatedKW/2 */                            \
+  {                                                                                                               \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2)) &&                                     \
+                    (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2)),                                       \
+                    XAI_ERR_EDGE, "Invalid edge for odd kernel size");                                            \
+  }                                                                                                               \
+  else /* even dilated width: LEFTEDGE flag selects the DIM2 edge that needs dilatedKW/2 */                       \
+  {                                                                                                               \
+    if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                                    \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2)) &&                                   \
+                      (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2 - 1)),                                 \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag set");                 \
+    }                                                                                                             \
+    else                                                                                                          \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2 - 1)) &&                               \
+                      (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2)),                                     \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag reset");               \
+    }                                                                                                             \
+  }                                                                                                               \
+  if (dilatedKH % 2 != 0) /* same scheme for height: DIM3 edges, dilatedKH, TOPEDGE flag */                       \
+  {                                                                                                               \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) &&                                       \
+                    (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2),                                         \
+                    XAI_ERR_EDGE, "Invalid edge for odd kernel size");                                            \
+  }                                                                                                               \
+  else                                                                                                            \
+  {                                                                                                               \
+    if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                                     \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2)) &&                                   \
+                      (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2 - 1)),                                 \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag set");                  \
+    }                                                                                                             \
+    else                                                                                                          \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2 - 1)) &&                               \
+                      (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)),                                     \
+                      XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag reset");                \
+    }                                                                                                             \
+  } /* NOTE(review): trailing continuation; the blank line below ends the macro, keep it */                       \
+
+#define XAI_CHECK_EDGES_MOD_ID16WH(inTile, coeffT, param) /* Check inTile edges vs kernel size */ \
+  int32_t kWidthMOD, kHeightMOD; /* NOTE: locals live in the expanding scope (no braces) */       \
+  uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);                                         \
+  uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);                                         \
+  kWidthMOD  = dilationX * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1; /* dilated width (DIM3) */      \
+  kHeightMOD = dilationY * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1; /* dilated height (DIM2) */     \
+  if (kWidthMOD % 2 != 0) /* odd width: (DIM1 edge >> 4) >= kWidthMOD/2 on both sides */          \
+  {                                                                                               \
+    if (kHeightMOD % 2 != 0) /* odd height: both DIM3 edges >= kHeightMOD/2 */                    \
+    { /* NOTE(review): DIM1 edge >> 4 presumably means 16 elements per column - confirm */        \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2)                 \
+                      && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2)              \
+                      && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                    \
+                      && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                   \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");        \
+    }                                                                                             \
+    else /* even height: TOPEDGE set => EDGE2 may be one less, else EDGE1 */                      \
+    {                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                   \
+      {                                                                                           \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2)               \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2)            \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                  \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1)),         \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+      }                                                                                           \
+      else                                                                                        \
+      {                                                                                           \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2)               \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2)            \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1))          \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+      }                                                                                           \
+    }                                                                                             \
+  }                                                                                               \
+  else /* even width: LEFTEDGE set => DIM1 EDGE2 may be one less, else EDGE1 */                   \
+  {                                                                                               \
+    if (kHeightMOD % 2 != 0)                                                                      \
+    {                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                  \
+      {                                                                                           \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2)               \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= (kWidthMOD / 2) - 1)      \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                  \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+      }                                                                                           \
+      else                                                                                        \
+      {                                                                                           \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= ((kWidthMOD / 2) - 1))       \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2)            \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                  \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+      }                                                                                           \
+    }                                                                                             \
+    else /* both even: TOPEDGE and LEFTEDGE rules combine */                                      \
+    {                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                   \
+      {                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                \
+        {                                                                                         \
+          XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= (kWidthMOD / 2)) &&       \
+                           ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \
+                           (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) &&             \
+                           (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))),        \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+        }                                                                                         \
+        else                                                                                      \
+        {                                                                                         \
+          XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \
+                           ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= (kWidthMOD / 2)) &&       \
+                           (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) &&             \
+                           (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))),        \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+        }                                                                                         \
+      }                                                                                           \
+      else                                                                                        \
+      {                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                \
+        {                                                                                         \
+          XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= (kWidthMOD / 2)) &&       \
+                           ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \
+                           (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) &&       \
+                           (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))),              \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+        }                                                                                         \
+        else                                                                                      \
+        {                                                                                         \
+          XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \
+                           ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= (kWidthMOD / 2)) &&       \
+                           (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) &&       \
+                           (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))),              \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+        }                                                                                         \
+      }                                                                                           \
+    }                                                                                             \
+  }
+
+#define XAI_CHECK_EDGES_MOD_ID32WH(inTile, coeffT, param) /* Check inTile edges vs kernel size */ \
+  int32_t kWidthMOD, kHeightMOD; /* NOTE: locals live in the expanding scope (no braces) */       \
+  uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);                                         \
+  uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);                                         \
+  kWidthMOD  = dilationX * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1; /* dilated width (DIM3) */      \
+  kHeightMOD = dilationY * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1; /* dilated height (DIM2) */     \
+  if (kWidthMOD % 2 != 0) /* odd width: (DIM1 edge >> 5) >= kWidthMOD/2 on both sides */          \
+  {                                                                                               \
+    if (kHeightMOD % 2 != 0) /* odd height: both DIM3 edges >= kHeightMOD/2 */                    \
+    { /* NOTE(review): DIM1 edge >> 5 presumably means 32 elements per column - confirm */        \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2)                 \
+                      && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2)              \
+                      && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                    \
+                      && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                   \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");        \
+    }                                                                                             \
+    else /* even height: TOPEDGE set => EDGE2 may be one less, else EDGE1 */                      \
+    {                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                   \
+      {                                                                                           \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2)               \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2)            \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                  \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1)),         \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+      }                                                                                           \
+      else                                                                                        \
+      {                                                                                           \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2)               \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2)            \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1))          \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+      }                                                                                           \
+    }                                                                                             \
+  }                                                                                               \
+  else /* even width: LEFTEDGE set => DIM1 EDGE2 may be one less, else EDGE1 */                   \
+  {                                                                                               \
+    if (kHeightMOD % 2 != 0)                                                                      \
+    {                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                  \
+      {                                                                                           \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2)               \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= (kWidthMOD / 2) - 1)      \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                  \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+      }                                                                                           \
+      else                                                                                        \
+      {                                                                                           \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= ((kWidthMOD / 2) - 1))       \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2)            \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                  \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+      }                                                                                           \
+    }                                                                                             \
+    else /* both even: TOPEDGE and LEFTEDGE rules combine */                                      \
+    {                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                   \
+      {                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                \
+        {                                                                                         \
+          XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= (kWidthMOD / 2)) &&       \
+                           ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \
+                           (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) &&             \
+                           (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))),        \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+        }                                                                                         \
+        else                                                                                      \
+        {                                                                                         \
+          XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \
+                           ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= (kWidthMOD / 2)) &&       \
+                           (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) &&             \
+                           (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))),        \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+        }                                                                                         \
+      }                                                                                           \
+      else                                                                                        \
+      {                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                \
+        {                                                                                         \
+          XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= (kWidthMOD / 2)) &&       \
+                           ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \
+                           (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) &&       \
+                           (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))),              \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+        }                                                                                         \
+        else                                                                                      \
+        {                                                                                         \
+          XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \
+                           ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= (kWidthMOD / 2)) &&       \
+                           (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) &&       \
+                           (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))),              \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+        }                                                                                         \
+      }                                                                                           \
+    }                                                                                             \
+  }
+#define XAI_CHECK_EDGES_DEPTHWISE_MOD_ID16WH(inTile, coeffT, param)                               \
+  int32_t kWidthMOD, kHeightMOD;                                                                  \
+  uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);                                         \
+  uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);                                         \
+  kWidthMOD  = dilationX * (XAI_TILE3D_GET_DIM2(coeffT) - 1) + 1;                                 \
+  kHeightMOD = dilationY * (XAI_TILE3D_GET_DIM3(coeffT) - 1) + 1;                                 \
+  if (kWidthMOD % 2 != 0)                                                                         \
+  {                                                                                               \
+    if (kHeightMOD % 2 != 0)                                                                      \
+    {                                                                                             \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2)                 \
+                      && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2)              \
+                      && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                    \
+                      && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                   \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");        \
+    }                                                                                             \
+    else                                                                                          \
+    {                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                   \
+      {                                                                                           \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2)               \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2)            \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                  \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1)),         \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+      }                                                                                           \
+      else                                                                                        \
+      {                                                                                           \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2)               \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2)            \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1))          \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+      }                                                                                           \
+    }                                                                                             \
+  }                                                                                               \
+  else                                                                                            \
+  {                                                                                               \
+    if (kHeightMOD % 2 != 0)                                                                      \
+    {                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                  \
+      {                                                                                           \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2)               \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= (kWidthMOD / 2) - 1)      \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                  \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+      }                                                                                           \
+      else                                                                                        \
+      {                                                                                           \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= ((kWidthMOD / 2) - 1))       \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2)            \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                  \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                 \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+      }                                                                                           \
+    }                                                                                             \
+    else                                                                                          \
+    {                                                                                             \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                   \
+      {                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                \
+        {                                                                                         \
+          XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= (kWidthMOD / 2)) &&       \
+                           ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \
+                           (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) &&             \
+                           (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))),        \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+        }                                                                                         \
+        else                                                                                      \
+        {                                                                                         \
+          XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \
+                           ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= (kWidthMOD / 2)) &&       \
+                           (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) &&             \
+                           (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))),        \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+        }                                                                                         \
+      }                                                                                           \
+      else                                                                                        \
+      {                                                                                           \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                \
+        {                                                                                         \
+          XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= (kWidthMOD / 2)) &&       \
+                           ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \
+                           (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) &&       \
+                           (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))),              \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+        }                                                                                         \
+        else                                                                                      \
+        {                                                                                         \
+          XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \
+                           ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= (kWidthMOD / 2)) &&       \
+                           (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) &&       \
+                           (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))),              \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+        }                                                                                         \
+      }                                                                                           \
+    }                                                                                             \
+  }
+
+#define XAI_CHECK_CONSISTENCY_DEPTHWISE_MOD_ID16WH(inTile, coeffT, outTile, param) /* Validate in/out tile sizes against the dilated kernel and stride. */                \
+  {                                                                                                                                                                       \
+    uint16_t dilationX     = XAI_CNN_CONV_GET_DILATIONX(param);                                                                                                           \
+    uint16_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);                                                                                                           \
+    int32_t dilatedkWidth  = dilationX * (XAI_TILE3D_GET_DIM2(coeffT) - 1) + 1; /* kernel extent along coeffT DIM2 after dilation */                                      \
+    int32_t dilatedkHeight = dilationY * (XAI_TILE3D_GET_DIM3(coeffT) - 1) + 1; /* kernel extent along coeffT DIM3 after dilation */                                      \
+    XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM2(inTile) << 4) == (XAI_TILE3D_GET_DIM2(outTile) << 4)),                                                                          \
+                    XAI_ERR_DATASIZE, "Number of input and output channel should be equal."); /* DIM2 of inTile and outTile must match */                                 \
+    if (dilatedkWidth % 2 != 0) /* odd k > 0: (k >> 1) + (k >> 1) - k == -1 */                                                                                            \
+    {                                                                                                                                                                     \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outTile) >> 4) <= ((((XAI_TILE3D_GET_DIM1(inTile) >> 4) +                                                                     \
+                                                                 (dilatedkWidth >> 1) + (dilatedkWidth >> 1) - dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                                \
+    }                                                                                                                                                                     \
+    else /* even k > 0: (k >> 1) + ((k >> 1) - 1) - k == -1 as well */                                                                                                    \
+    {                                                                                                                                                                     \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outTile) >> 4) <= ((((XAI_TILE3D_GET_DIM1(inTile) >> 4) +                                                                     \
+                                                                 (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) -                                                      \
+                                                                 dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),                                               \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                                \
+    }                                                                                                                                                                     \
+    if (dilatedkHeight % 2 != 0) /* same bound along DIM3 using STRIDEY */                                                                                                \
+    {                                                                                                                                                                     \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outTile) <= (((XAI_TILE3D_GET_DIM3(inTile) + (dilatedkHeight >> 1) +                                                           \
+                                                          (dilatedkHeight >> 1) - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                             \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                               \
+    }                                                                                                                                                                     \
+    else                                                                                                                                                                  \
+    {                                                                                                                                                                     \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outTile) <= (((XAI_TILE3D_GET_DIM3(inTile) + (dilatedkHeight >> 1) +                                                           \
+                                                          ((dilatedkHeight >> 1) - 1) - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                       \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                               \
+    }                                                                                                                                                                     \
+  }
+
+#define XAI_CHECK_EDGES_DEPTHWISE_MOD_ID32WH(inTile, coeffT, param)  { /* Verify inTile edges. */   \
+    int32_t kWidthMOD, kHeightMOD; /* dilated kernel extents (from coeffT DIM2 / DIM3) */           \
+    uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param);                                         \
+    uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param);                                         \
+    kWidthMOD  = dilationX * (XAI_TILE3D_GET_DIM2(coeffT) - 1) + 1;                                 \
+    kHeightMOD = dilationY * (XAI_TILE3D_GET_DIM3(coeffT) - 1) + 1;                                 \
+    if (kWidthMOD % 2 != 0) /* odd width: both DIM1 edges (>> 5) need k/2 */                        \
+    {                                                                                               \
+      if (kHeightMOD % 2 != 0) /* odd height: both DIM3 edges need k/2 */                           \
+      {                                                                                             \
+        XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2)                 \
+                        && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2)              \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                    \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                   \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");        \
+      }                                                                                             \
+      else /* even height: requirement depends on TOPEDGE */                                        \
+      {                                                                                             \
+        if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) /* TOPEDGE set: DIM3_EDGE2 may be k/2 - 1 */      \
+        {                                                                                           \
+          XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2)               \
+                          && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2)            \
+                          && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                  \
+                          && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1)),         \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+        }                                                                                           \
+        else /* TOPEDGE clear: DIM3_EDGE1 may be k/2 - 1 */                                         \
+        {                                                                                           \
+          XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2)               \
+                          && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2)            \
+                          && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1))          \
+                          && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+        }                                                                                           \
+      }                                                                                             \
+    }                                                                                               \
+    else /* even width: requirement depends on LEFTEDGE */                                          \
+    {                                                                                               \
+      if (kHeightMOD % 2 != 0)                                                                      \
+      {                                                                                             \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* LEFTEDGE set: DIM1_EDGE2 may be k/2 - 1 */    \
+        {                                                                                           \
+          XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2)               \
+                          && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= (kWidthMOD / 2) - 1)      \
+                          && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                  \
+                          && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+        }                                                                                           \
+        else /* LEFTEDGE clear: DIM1_EDGE1 may be k/2 - 1 */                                        \
+        {                                                                                           \
+          XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= ((kWidthMOD / 2) - 1))       \
+                          && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2)            \
+                          && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2)                  \
+                          && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2),                 \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");      \
+        }                                                                                           \
+      }                                                                                             \
+      else /* even width and height: both flags apply */                                            \
+      {                                                                                             \
+        if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                   \
+        {                                                                                           \
+          if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                \
+          {                                                                                         \
+            XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= (kWidthMOD / 2)) &&       \
+                             ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \
+                             (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) &&             \
+                             (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))),        \
+                            XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+          }                                                                                         \
+          else                                                                                      \
+          {                                                                                         \
+            XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \
+                             ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= (kWidthMOD / 2)) &&       \
+                             (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) &&             \
+                             (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))),        \
+                            XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+          }                                                                                         \
+        }                                                                                           \
+        else                                                                                        \
+        {                                                                                           \
+          if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                \
+          {                                                                                         \
+            XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= (kWidthMOD / 2)) &&       \
+                             ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \
+                             (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) &&       \
+                             (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))),              \
+                            XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+          }                                                                                         \
+          else                                                                                      \
+          {                                                                                         \
+            XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \
+                             ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= (kWidthMOD / 2)) &&       \
+                             (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) &&       \
+                             (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))),              \
+                            XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");    \
+          }                                                                                         \
+        }                                                                                           \
+      }                                                                                             \
+    }                                                                                               \
+}
+#define XAI_CHECK_CONSISTENCY_DEPTHWISE_MOD_ID32WH(inTile, coeffT, outTile, param) /* Validate in/out tile sizes against the dilated kernel and stride. */                \
+  {                                                                                                                                                                       \
+    uint16_t dilationX     = XAI_CNN_CONV_GET_DILATIONX(param);                                                                                                           \
+    uint16_t dilationY     = XAI_CNN_CONV_GET_DILATIONY(param);                                                                                                           \
+    int32_t dilatedkWidth  = dilationX * (XAI_TILE3D_GET_DIM2(coeffT) - 1) + 1; /* kernel extent along coeffT DIM2 after dilation */                                      \
+    int32_t dilatedkHeight = dilationY * (XAI_TILE3D_GET_DIM3(coeffT) - 1) + 1; /* kernel extent along coeffT DIM3 after dilation */                                      \
+    XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM2(inTile) << 5) == (XAI_TILE3D_GET_DIM2(outTile) << 5)),                                                                          \
+                    XAI_ERR_DATASIZE, "Number of input and output channel should be equal."); /* DIM2 of inTile and outTile must match */                                 \
+    if (dilatedkWidth % 2 != 0) /* odd k > 0: (k >> 1) + (k >> 1) - k == -1 */                                                                                            \
+    {                                                                                                                                                                     \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outTile) >> 5) <= ((((XAI_TILE3D_GET_DIM1(inTile) >> 5) +                                                                     \
+                                                                 (dilatedkWidth >> 1) + (dilatedkWidth >> 1) - dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                                \
+    }                                                                                                                                                                     \
+    else /* even k > 0: (k >> 1) + ((k >> 1) - 1) - k == -1 as well */                                                                                                    \
+    {                                                                                                                                                                     \
+      XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outTile) >> 5) <= ((((XAI_TILE3D_GET_DIM1(inTile) >> 5) +                                                                     \
+                                                                 (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) -                                                      \
+                                                                 dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),                                               \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                                                \
+    }                                                                                                                                                                     \
+    if (dilatedkHeight % 2 != 0) /* same bound along DIM3 using STRIDEY */                                                                                                \
+    {                                                                                                                                                                     \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outTile) <= (((XAI_TILE3D_GET_DIM3(inTile) + (dilatedkHeight >> 1) +                                                           \
+                                                          (dilatedkHeight >> 1) - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                             \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                               \
+    }                                                                                                                                                                     \
+    else                                                                                                                                                                  \
+    {                                                                                                                                                                     \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outTile) <= (((XAI_TILE3D_GET_DIM3(inTile) + (dilatedkHeight >> 1) +                                                           \
+                                                          ((dilatedkHeight >> 1) - 1) - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),                       \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                                               \
+    }                                                                                                                                                                     \
+  }
+
+#define XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile)                               {                                                                                              \
+    if (XAI_CNN_CONV_GET_FLAG_RELU(param))                                                                                                                                         \
+    {                                                                                                                                                                              \
+      XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN(param) <= XAI_CNN_CONV_GET_RELU_MAX(param)), XAI_ERR_BADARG,                                                                      \
+                      "\nMinimum Value of RELU = %d,\nMaximum Value of RELU = %d , Min Limit should not be greater than Max Limit",                                                \
+                      XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param));                                                                                         \
+      if (XAI_TYPE_ELEMENT_TYPE(outTile->type) == XAI_U8)                                                                                                                          \
+      {                                                                                                                                                                            \
+        XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN(param) >= 0 &&                                                                                                                  \
+                         XAI_CNN_CONV_GET_RELU_MAX(param) <= UCHAR_MAX), XAI_ERR_BADARG,                                                                                           \
+                        "\nMinimum Value of RELU = %d, value should be greater than or equal to 0 \nMaximum Value of RELU = %d, value should be less than or equal to 255",        \
+                        XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param));                                                                                       \
+      }                                                                                                                                                                            \
+      else if (XAI_TYPE_ELEMENT_TYPE(outTile->type) == XAI_S8)                                                                                                                     \
+      {                                                                                                                                                                            \
+        XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN(param) >= SCHAR_MIN &&                                                                                                          \
+                         XAI_CNN_CONV_GET_RELU_MAX(param) <= SCHAR_MAX), XAI_ERR_BADARG,                                                                                           \
+                        "\nMinimum Value of RELU = %d, value should be greater than or equal to -128 \nMaximum Value of RELU = %d, value should be less than or equal to 127",     \
+                        XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param));                                                                                       \
+      }                                                                                                                                                                            \
+      else if (XAI_TYPE_ELEMENT_TYPE(outTile->type) == XAI_S16)                                                                                                                    \
+      {                                                                                                                                                                            \
+        XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN(param) >= SHRT_MIN &&                                                                                                           \
+                         XAI_CNN_CONV_GET_RELU_MAX(param) <= SHRT_MAX), XAI_ERR_BADARG,                                                                                            \
+                        "\nMinimum Value of RELU = %d, value should be greater than or equal to -32768 \nMaximum Value of RELU = %d, value should be less than or equal to 32767", \
+                        XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param));                                                                                       \
+      }                                                                                                                                                                            \
+      else if (XAI_TYPE_ELEMENT_TYPE(outTile->type) == XAI_U16)                                                                                                                    \
+      {                                                                                                                                                                            \
+        XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN(param) >= 0 &&                                                                                                                  \
+                         XAI_CNN_CONV_GET_RELU_MAX(param) <= USHRT_MAX), XAI_ERR_BADARG,                                                                                           \
+                        "\nMinimum Value of RELU = %d, value should be greater than or equal to 0 \nMaximum Value of RELU = %d, value should be less than or equal to 65535",      \
+                        XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param));                                                                                       \
+      }                                                                                                                                                                            \
+      else                                                                                                                                                                         \
+      {                                                                                                                                                                            \
+        XAI_CHECK_ERROR(0, XAI_ERR_NO_VARIANT, "Output tile datatype is not supported by XAI_CHECK_CONV_RELU_LIMITS_IX");                                                          \
+      }                                                                                                                                                                            \
+    }                                                                                                                                                                              \
+}
+
+#define XAI_CHECK_DEPTHWISE_DILATED_CONV_RELU_LIMITS_IX(param, outTile)             { /* RELU min/max must fit the output tile element type */      \
+    if (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_RELU(param)) /* limits are only validated when RELU is enabled */                                  \
+    { /* param: source of RELU flag/min/max; outTile: pointer whose ->type is read */                                                               \
+      XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param) <= XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param)),                   \
+                      XAI_ERR_BADARG, "\nMinimum Value of RELU = %d,\nMaximum Value of RELU = %d , Min Limit should not be greater than Max Limit", \
+                      XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param));                      \
+      if (XAI_TYPE_ELEMENT_TYPE((outTile)->type) == XAI_U8)                                                                                         \
+      {                                                                                                                                             \
+        XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param) >= 0 &&                                                                 \
+                         XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param) <= UCHAR_MAX), XAI_ERR_BADARG,                                          \
+                        "\nMinimum Value of RELU = %d, value should be greater than or equal to 0 \nMaximum Value of RELU = %d,"                    \
+                        " value should be less than or equal to 255",                                                                               \
+                        XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param));                    \
+      }                                                                                                                                             \
+      else if (XAI_TYPE_ELEMENT_TYPE((outTile)->type) == XAI_S8)                                                                                    \
+      {                                                                                                                                             \
+        XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param) >= SCHAR_MIN &&                                                         \
+                         XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param) <= SCHAR_MAX), XAI_ERR_BADARG,                                          \
+                        "\nMinimum Value of RELU = %d, value should be greater than or equal to -128 \nMaximum Value of RELU = %d,"                 \
+                        " value should be less than or equal to 127",                                                                               \
+                        XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param));                    \
+      }                                                                                                                                             \
+      else if (XAI_TYPE_ELEMENT_TYPE((outTile)->type) == XAI_S16)                                                                                   \
+      {                                                                                                                                             \
+        XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param) >= SHRT_MIN &&                                                          \
+                         XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param) <= SHRT_MAX), XAI_ERR_BADARG,                                           \
+                        "\nMinimum Value of RELU = %d, value should be greater than or equal to -32768 \nMaximum Value of RELU = %d,"               \
+                        " value should be less than or equal to 32767",                                                                             \
+                        XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param));                    \
+      }                                                                                                                                             \
+      else if (XAI_TYPE_ELEMENT_TYPE((outTile)->type) == XAI_U16)                                                                                   \
+      {                                                                                                                                             \
+        XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param) >= 0 &&                                                                 \
+                         XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param) <= USHRT_MAX), XAI_ERR_BADARG,                                          \
+                        "\nMinimum Value of RELU = %d, value should be greater than or equal to 0 \nMaximum Value of RELU = %d,"                    \
+                        " value should be less than or equal to 65535",                                                                             \
+                        XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param));                    \
+      }                                                                                                                                             \
+      else /* element type not handled above */                                                                                                     \
+      {                                                                                                                                             \
+        XAI_CHECK_ERROR(0, XAI_ERR_NO_VARIANT, "Output tile datatype is not supported by XAI_CHECK_DEPTHWISE_DILATED_CONV_RELU_LIMITS_IX");         \
+      }                                                                                                                                             \
+    }                                                                                                                                               \
+}
+
+#define XAI_CHECK_CONSISTENCY_DEPTHWISE_MOD_DWH(inT, coeffT, biasArr, outT, param)  { /* tile/kernel/bias size checks */           \
+    int32_t KW_MOD = XAI_TILE3D_GET_DIM2(coeffT); /* kernel width taken from coeffT dim 2 */                                       \
+    int32_t KH_MOD = XAI_TILE3D_GET_DIM3(coeffT); /* kernel height taken from coeffT dim 3 */                                      \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                     \
+                    "Number of Input Channels not equal to the number of channels in the Kernel.");                                \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                    \
+                    "Number of Output Channels not equal to the number of channels in the Kernel.");                               \
+    if (KW_MOD % 2 != 0) /* odd extent: both terms are KW_MOD >> 1 (presumably equal left/right edges) */                          \
+    {                                                                                                                              \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1)                                    \
+                                                       + (KW_MOD >> 1) - KW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)),       \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                         \
+    }                                                                                                                              \
+    else /* even extent: second term is one smaller */                                                                             \
+    {                                                                                                                              \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1)                                    \
+                                                       + ((KW_MOD >> 1) - 1) - KW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \
+                      XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                         \
+    }                                                                                                                              \
+    if (KH_MOD % 2 != 0) /* same odd/even split for the height */                                                                  \
+    {                                                                                                                              \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1)                                    \
+                                                       + (KH_MOD >> 1) - KH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)),       \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                        \
+    }                                                                                                                              \
+    else                                                                                                                           \
+    {                                                                                                                              \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1)                                    \
+                                                       + ((KH_MOD >> 1) - 1) - KH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \
+                      XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                        \
+    }                                                                                                                              \
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                 \
+                    "Width of Bias Array is less than number of channels in the Kernel.");                                         \
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                           \
+                    "Height of Bias Array should be greater than zero.");                                                          \
+}
+
+#if (((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+  #define XAI_CHECK_CONSISTENCY_F16_MOD_DWH(inT, coeffT, biasArr, outT, param)  { /* Tile, kernel and bias size consistency checks */    \
+    int32_t KW_MOD = (XAI_TILE3D_GET_DIM2(coeffT) - 1) * XAI_CNN_CONV_GET_DILATIONX(param) + 1; /* dilated kernel width */               \
+    int32_t KH_MOD = (XAI_TILE3D_GET_DIM3(coeffT) - 1) * XAI_CNN_CONV_GET_DILATIONY(param) + 1; /* dilated kernel height */              \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE3D_GET_DIM2(coeffT), XAI_ERR_DATASIZE, /* NOTE(review): DIM2(coeffT) also feeds KW_MOD - confirm */ \
+                    "Number of Input Channels not equal to the number of channels in the Kernel.");                                      \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                          \
+                    "Number of Output Channels not equal to the number of channels in the Kernel.");                                     \
+    if (KW_MOD % 2 != 0) /* NOTE(review): >> (stride >> 1) matches / stride only for stride 1, 2 or 4 */                                 \
+    {                                                                                                                                    \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1)                                          \
+                                                       + (KW_MOD >> 1) - KW_MOD) >> (XAI_CNN_CONV_GET_STRIDEX(param) >> 1)) + 1)),       \
+                      XAI_ERR_DATASIZE, "Output Width is invalid.");                                                                     \
+    }                                                                                                                                    \
+    else                                                                                                                                 \
+    {                                                                                                                                    \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1)                                          \
+                                                       + ((KW_MOD >> 1) - 1) - KW_MOD) >> (XAI_CNN_CONV_GET_STRIDEX(param) >> 1)) + 1)), \
+                      XAI_ERR_DATASIZE, "Output Width is invalid.");                                                                     \
+    }                                                                                                                                    \
+    if (KH_MOD % 2 != 0)                                                                                                                 \
+    {                                                                                                                                    \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1)                                          \
+                                                       + (KH_MOD >> 1) - KH_MOD) >> (XAI_CNN_CONV_GET_STRIDEY(param) >> 1)) + 1)),       \
+                      XAI_ERR_DATASIZE, "Output Height is invalid.");                                                                    \
+    }                                                                                                                                    \
+    else                                                                                                                                 \
+    {                                                                                                                                    \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1)                                          \
+                                                       + ((KH_MOD >> 1) - 1) - KH_MOD) >> (XAI_CNN_CONV_GET_STRIDEY(param) >> 1)) + 1)), \
+                      XAI_ERR_DATASIZE, "Output Height is invalid.");                                                                    \
+    }                                                                                                                                    \
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                       \
+                    "Width of Bias Array is less than number of channels in the Kernel.");                                               \
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                                 \
+                    "Height of Bias Array should be greater than zero.");                                                                \
+}
+#endif //if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+
+#define XAI_CHECK_CONSISTENCY_DEPTHWISE_DILATED_MOD_DWH(inT, coeffT, biasArr, outT, param) /* unbraced: KW_MOD/KH_MOD enter the caller scope */    \
+  int32_t KW_MOD = (XAI_TILE3D_GET_DIM2(coeffT) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param) + 1; /* dilated kernel width */         \
+  int32_t KH_MOD = (XAI_TILE3D_GET_DIM3(coeffT) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param) + 1; /* dilated kernel height */        \
+  XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(inT) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(param))                                          \
+                  == XAI_TILE3D_GET_DIM1(coeffT),                                                                                                  \
+                  XAI_ERR_DATASIZE,                                                                                                                \
+                  "Number of Input Channels not equal to the number of channels in the Kernel.");                                                  \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                                      \
+                  "Number of Output Channels not equal to the number of channels in the Kernel.");                                                 \
+  if (KW_MOD % 2 != 0) /* odd extent: both terms are KW_MOD >> 1 (presumably equal left/right edges) */                                            \
+  {                                                                                                                                                \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1)                                                      \
+                                                     + (KW_MOD >> 1) - KW_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                           \
+  }                                                                                                                                                \
+  else /* even extent: second term is one smaller */                                                                                               \
+  {                                                                                                                                                \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1)                                                      \
+                                                     + ((KW_MOD >> 1) - 1) - KW_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                           \
+  }                                                                                                                                                \
+  if (KH_MOD % 2 != 0) /* same odd/even split for the height */                                                                                    \
+  {                                                                                                                                                \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1)                                                      \
+                                                     + (KH_MOD >> 1) - KH_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                          \
+  }                                                                                                                                                \
+  else                                                                                                                                             \
+  {                                                                                                                                                \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1)                                                      \
+                                                     + ((KH_MOD >> 1) - 1) - KH_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                          \
+  }                                                                                                                                                \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                                   \
+                  "Width of Bias Array is less than number of channels in the Kernel.");                                                           \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                                             \
+                  "Height of Bias Array should be greater than zero.");
+
+#define XAI_CHECK_CONSISTENCY_DEPTHWISE_DILATED_VQ_MOD_DWH(inT, coeffT, biasArr, outputScaleArray, outT, param) /* unbraced: adds KW_MOD/KH_MOD */ \
+  int32_t KW_MOD = (XAI_TILE3D_GET_DIM2(coeffT) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param) + 1; /* dilated kernel width */         \
+  int32_t KH_MOD = (XAI_TILE3D_GET_DIM3(coeffT) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param) + 1; /* dilated kernel height */        \
+  XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(inT) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(param))                                          \
+                  == XAI_TILE3D_GET_DIM1(coeffT),                                                                                                  \
+                  XAI_ERR_DATASIZE,                                                                                                                \
+                  "Number of Input Channels not equal to the number of channels in the Kernel.");                                                  \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                                      \
+                  "Number of Output Channels not equal to the number of channels in the Kernel.");                                                 \
+  if (KW_MOD % 2 != 0) /* odd extent: both terms are KW_MOD >> 1 (presumably equal left/right edges) */                                            \
+  {                                                                                                                                                \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1)                                                      \
+                                                     + (KW_MOD >> 1) - KW_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                           \
+  }                                                                                                                                                \
+  else /* even extent: second term is one smaller */                                                                                               \
+  {                                                                                                                                                \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1)                                                      \
+                                                     + ((KW_MOD >> 1) - 1) - KW_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                           \
+  }                                                                                                                                                \
+  if (KH_MOD % 2 != 0) /* same odd/even split for the height */                                                                                    \
+  {                                                                                                                                                \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1)                                                      \
+                                                     + (KH_MOD >> 1) - KH_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                          \
+  }                                                                                                                                                \
+  else                                                                                                                                             \
+  {                                                                                                                                                \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1)                                                      \
+                                                     + ((KH_MOD >> 1) - 1) - KH_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                          \
+  }                                                                                                                                                \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                                   \
+                  "Width of Bias Array is less than number of channels in the Kernel.");                                                           \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                                             \
+                  "Height of Bias Array should be greater than zero.");                                                                            \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE,                                          \
+                  "Width of Output Scale Array is less than number of channels in the Kernel.");                                                   \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(outputScaleArray) > 0, XAI_ERR_DATASIZE,                                                                    \
+                  "Height of Output Scale Array should be greater than zero.");
+
+#define XAI_CHECK_CONSISTENCY_DEPTHWISE_DILATED_VQ_MOW_WHD(inT, coeffT, biasArr, outputScaleArray, outT, param) /* Cross-checks the dimensions of inT, coeffT, biasArr, outputScaleArray and outT; each failed check goes to XAI_CHECK_ERROR with XAI_ERR_DATASIZE. Declares KW_MOW and KH_MOW in the expansion scope. */ \
+  int32_t KW_MOW = (XAI_TILE3D_GET_DIM1(coeffT) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param) + 1; /* dilated kernel extent, DIM1 */ \
+  int32_t KH_MOW = (XAI_TILE3D_GET_DIM2(coeffT) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param) + 1; /* dilated kernel extent, DIM2 */ \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(param)                                           \
+                  == XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE,                                                                               \
+                  "Number of Input Channels not equal to the number of channels in the Kernel.");                                                 \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outT) == XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE,                                                     \
+                  "Number of Output Channels not equal to the number of channels in the Kernel.");                                                \
+  if (KW_MOW % 2 != 0) /* odd extent: bound adds (KW_MOW >> 1) twice */                                                                           \
+  {                                                                                                                                               \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (KW_MOW >> 1)                                                     \
+                                                     + (KW_MOW >> 1) - KW_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                          \
+  }                                                                                                                                               \
+  else /* even extent: bound adds (KW_MOW >> 1) + ((KW_MOW >> 1) - 1) */                                                                          \
+  {                                                                                                                                               \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (KW_MOW >> 1)                                                     \
+                                                     + ((KW_MOW >> 1) - 1) - KW_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                                          \
+  }                                                                                                                                               \
+  if (KH_MOW % 2 != 0) /* odd extent: bound adds (KH_MOW >> 1) twice */                                                                           \
+  {                                                                                                                                               \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KH_MOW >> 1)                                                     \
+                                                     + (KH_MOW >> 1) - KH_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                         \
+  }                                                                                                                                               \
+  else /* even extent: bound adds (KH_MOW >> 1) + ((KH_MOW >> 1) - 1) */                                                                          \
+  {                                                                                                                                               \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KH_MOW >> 1)                                                     \
+                                                     + ((KH_MOW >> 1) - 1) - KH_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                                         \
+  }                                                                                                                                               \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE,                                                  \
+                  "Width of Bias Array is less than number of channels in the Kernel.");                                                          \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                                            \
+                  "Height of Bias Array should be greater than zero.");                                                                           \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE,                                         \
+                  "Width of Output Scale Array is less than number of channels in the Kernel.");                                                  \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(outputScaleArray) > 0, XAI_ERR_DATASIZE,                                                                   \
+                  "Height of Output Scale Array should be greater than zero.");
+
+#define XAI_CHECK_CONSISTENCY_DEPTHWISE_MOW_WHD(inT, coeffT, biasArr, outT, param) /* Cross-checks the dimensions of inT, coeffT, biasArr and outT; each failed check goes to XAI_CHECK_ERROR with XAI_ERR_DATASIZE. Declares kW_MOW and kH_MOW in the expansion scope. */ \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) == XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE,                                  \
+                  "Number of Input Channels not equal to the number of channels in the Kernel.");                             \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outT) == XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE,                                 \
+                  "Number of Output Channels not equal to the number of channels in the Kernel.");                            \
+  int32_t kW_MOW = XAI_TILE3D_GET_DIM1(coeffT); /* kernel extent, DIM1 */                                                     \
+  int32_t kH_MOW = XAI_TILE3D_GET_DIM2(coeffT); /* kernel extent, DIM2 */                                                     \
+  if (kW_MOW % 2 != 0) /* odd extent: bound adds (kW_MOW >> 1) twice */                                                       \
+  {                                                                                                                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (kW_MOW >> 1) +                               \
+                                                     (kW_MOW >> 1) - kW_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                      \
+  }                                                                                                                           \
+  else /* even extent: bound adds (kW_MOW >> 1) + ((kW_MOW >> 1) - 1) */                                                      \
+  {                                                                                                                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (kW_MOW >> 1) +                               \
+                                                     ((kW_MOW >> 1) - 1) - kW_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent.");                                      \
+  }                                                                                                                           \
+  if (kH_MOW % 2 != 0) /* odd extent: bound adds (kH_MOW >> 1) twice */                                                       \
+  {                                                                                                                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (kH_MOW >> 1) +                               \
+                                                     (kH_MOW >> 1) - kH_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)),       \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                     \
+  }                                                                                                                           \
+  else /* even extent: bound adds (kH_MOW >> 1) + ((kH_MOW >> 1) - 1) */                                                      \
+  {                                                                                                                           \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (kH_MOW >> 1) +                               \
+                                                     ((kH_MOW >> 1) - 1) - kH_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)), \
+                    XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.");                                     \
+  }                                                                                                                           \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE,                              \
+                  "Width of Bias Array is less than number of channels in the Kernel.");                                      \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE,                                                        \
+                  "Height of Bias Array should be greater than zero.");
+
+#define XAI_CHECK_KERNEL_SIZE_DEPTHWISE(coeffT, size) /* kernel must be size x size (XAI_ERR_KSIZE) */  \
+  if (XAI_TILE3D_GET_DATA_ORDER(coeffT) == XAI_WHD) /* WHD: compare DIM1 and DIM2 */                    \
+  {                                                                                                     \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(coeffT) == (size)) && (XAI_TILE3D_GET_DIM2(coeffT) == (size)), \
+                    XAI_ERR_KSIZE, "The Coefficient Kernel Size is not supported");                     \
+  }                                                                                                     \
+  else if (XAI_TILE3D_GET_DATA_ORDER(coeffT) == XAI_DWH) /* DWH: compare DIM2 and DIM3 */               \
+  {                                                                                                     \
+    XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(coeffT) == (size)) && (XAI_TILE3D_GET_DIM3(coeffT) == (size)), \
+                    XAI_ERR_KSIZE, "The Coefficient Kernel Size is not supported");                     \
+  } /* any other data order: no check is performed; size is parenthesized so expression arguments compare as a whole */
+
+#define XAI_CHECK_EDGES_DEPTHWISE_MOW_WHD(inTile, coeffTile, param) /* Checks that inTile's DIM1/DIM2 edge sizes are large enough for the coeffTile kernel; a shortfall goes to XAI_CHECK_ERROR with XAI_ERR_EDGE. Declares kW and kH in the expansion scope. */ \
+  int32_t kW = XAI_TILE3D_GET_DIM1(coeffTile); /* kernel extent along DIM1 */                  \
+  int32_t kH = XAI_TILE3D_GET_DIM2(coeffTile); /* kernel extent along DIM2 */                  \
+  if (kW % 2 != 0) /* odd kW: kW / 2 needed on both DIM1 edges */                              \
+  {                                                                                            \
+    if (kH % 2 != 0) /* odd kH: kH / 2 on both DIM2 edges */                                   \
+    {                                                                                          \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= kW / 2)                            \
+                      && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= kW / 2)                         \
+                      && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kH / 2)                         \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kH / 2),                        \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");     \
+    }                                                                                          \
+    else /* even kH: TOPEDGE flag picks the kH / 2 edge */                                     \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) /* DIM2_EDGE1 gets the larger bound */         \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= kW / 2)                          \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= kW / 2)                       \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kH / 2)                       \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kH / 2) - 1)),              \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+      else /* DIM2_EDGE2 gets the larger bound */                                              \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= kW / 2)                          \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= kW / 2)                       \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kH / 2) - 1))               \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kH / 2),                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+  else /* even kW: LEFTEDGE flag picks the kW / 2 edge */                                      \
+  {                                                                                            \
+    if (kH % 2 != 0) /* odd kH: kH / 2 on both DIM2 edges */                                   \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* DIM1_EDGE1 gets the larger bound */        \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= kW / 2)                          \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((kW / 2) - 1))               \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kH / 2)                       \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kH / 2),                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+      else /* DIM1_EDGE2 gets the larger bound */                                              \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((kW / 2) - 1))                  \
+                        && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= kW / 2)                       \
+                        && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kH / 2)                       \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kH / 2),                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+    }                                                                                          \
+    else /* both even: combine both flag rules */                                              \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) /* DIM2_EDGE1 gets the larger bound */         \
+      {                                                                                        \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* DIM1_EDGE1 gets the larger bound */      \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kH / 2) &&                    \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kH / 2) - 1)),               \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+        else /* DIM1_EDGE2 gets the larger bound */                                            \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kH / 2) &&                    \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kH / 2) - 1)),               \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+      }                                                                                        \
+      else /* DIM2_EDGE2 gets the larger bound */                                              \
+      {                                                                                        \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* DIM1_EDGE1 gets the larger bound */      \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (kW / 2 - 1) &&                \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kH / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kH / 2)),                     \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+        else /* DIM1_EDGE2 gets the larger bound */                                            \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kH / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kH / 2)),                     \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }
+
+#define XAI_CHECK_EDGES_DEPTHWISE_MOD_DWH(inTile, coeffTile, param) /* Checks that inTile's DIM2/DIM3 edge sizes are large enough for the coeffTile kernel; a shortfall goes to XAI_CHECK_ERROR with XAI_ERR_EDGE. Declares kW and kH in the expansion scope. */ \
+  int32_t kW = XAI_TILE3D_GET_DIM2(coeffTile); /* kernel extent along DIM2 */                  \
+  int32_t kH = XAI_TILE3D_GET_DIM3(coeffTile); /* kernel extent along DIM3 */                  \
+  if (kW % 2 != 0) /* odd kW: kW / 2 needed on both DIM2 edges */                              \
+  {                                                                                            \
+    if (kH % 2 != 0) /* odd kH: kH / 2 on both DIM3 edges */                                   \
+    {                                                                                          \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                            \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                         \
+                      && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                         \
+                      && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                        \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");     \
+    }                                                                                          \
+    else /* even kH: TOPEDGE flag picks the kH / 2 edge */                                     \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) /* DIM3_EDGE1 gets the larger bound */         \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)),              \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+      else /* DIM3_EDGE2 gets the larger bound */                                              \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1))               \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+  else /* even kW: LEFTEDGE flag picks the kW / 2 edge */                                      \
+  {                                                                                            \
+    if (kH % 2 != 0) /* odd kH: kH / 2 on both DIM3 edges */                                   \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* DIM2_EDGE1 gets the larger bound */        \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) - 1)                 \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+      else /* DIM2_EDGE2 gets the larger bound */                                              \
+      {                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1))                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");   \
+      }                                                                                        \
+    }                                                                                          \
+    else /* both even: combine both flag rules */                                              \
+    {                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) /* DIM3_EDGE1 gets the larger bound */         \
+      {                                                                                        \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* DIM2_EDGE1 gets the larger bound */      \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) &&                    \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)),               \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+        else /* DIM2_EDGE2 gets the larger bound */                                            \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) &&                    \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)),               \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+      }                                                                                        \
+      else /* DIM3_EDGE2 gets the larger bound */                                              \
+      {                                                                                        \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) /* DIM2_EDGE1 gets the larger bound */      \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)),                     \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+        else /* DIM2_EDGE2 gets the larger bound */                                            \
+        {                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) &&                    \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) &&              \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)),                     \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }
+
+#define XAI_CHECK_EDGES_DEPTHWISE_DILATED_MOD_DWH(inTile, coeffTile, param)                                    \
+  int32_t kW = (XAI_TILE3D_GET_DIM2(coeffTile) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param) + 1; \
+  int32_t kH = (XAI_TILE3D_GET_DIM3(coeffTile) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param) + 1; \
+  if (kW % 2 != 0)                                                                                             \
+  {                                                                                                            \
+    if (kH % 2 != 0)                                                                                           \
+    {                                                                                                          \
+      XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                                            \
+                      && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                                         \
+                      && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                                         \
+                      && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                                        \
+                      XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                     \
+    }                                                                                                          \
+    else                                                                                                       \
+    {                                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                                \
+      {                                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)),                              \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                   \
+      }                                                                                                        \
+      else                                                                                                     \
+      {                                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1))                               \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                   \
+      }                                                                                                        \
+    }                                                                                                          \
+  }                                                                                                            \
+  else                                                                                                         \
+  {                                                                                                            \
+    if (kH % 2 != 0)                                                                                           \
+    {                                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                               \
+      {                                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2)                                          \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) - 1)                                 \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                   \
+      }                                                                                                        \
+      else                                                                                                     \
+      {                                                                                                        \
+        XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1))                                  \
+                        && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2)                                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2)                                       \
+                        && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2),                                      \
+                        XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                   \
+      }                                                                                                        \
+    }                                                                                                          \
+    else                                                                                                       \
+    {                                                                                                          \
+      if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param))                                                                \
+      {                                                                                                        \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                             \
+        {                                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) &&                                    \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) &&                              \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) &&                                    \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)),                               \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                 \
+        }                                                                                                      \
+        else                                                                                                   \
+        {                                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) &&                              \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) &&                                    \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) &&                                    \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)),                               \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                 \
+        }                                                                                                      \
+      }                                                                                                        \
+      else                                                                                                     \
+      {                                                                                                        \
+        if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param))                                                             \
+        {                                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) &&                                    \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) &&                              \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) &&                              \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)),                                     \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                 \
+        }                                                                                                      \
+        else                                                                                                   \
+        {                                                                                                      \
+          XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) &&                              \
+                           XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) &&                                    \
+                           XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) &&                              \
+                           XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)),                                     \
+                          XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data");                 \
+        }                                                                                                      \
+      }                                                                                                        \
+    }                                                                                                          \
+  }
+
+#define XAI_CHECK_ROI_POOLING_PARAMS(param) /* Range-checks the scale and shift fields of the ROI pooling parameters. */                                         \
+  XAI_CHECK_ERROR(((XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALEX(param) <= 32767) && (XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALEY(param) <= 32767)),                        \
+                  XAI_ERR_NORM, "spatialScaleX & spatialScaleY should be less than U15_MAX");                                                                    \
+  XAI_CHECK_ERROR(((XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_WIDTH_SCALE(param) <= 32767) && (XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_HEIGHT_SCALE(param) <= 32767)), \
+                  XAI_ERR_NORM, "oneByPooledWidth & oneByPooledHeight should be less than U15_MAX");                                                             \
+  XAI_CHECK_ERROR(((XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALE_SHIFTX(param) < 32) && (XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALE_SHIFTY(param) < 32)),                    \
+                  XAI_ERR_NORM, "spatialScaleShiftX & spatialScaleShiftY should be less than 32 (scalar shift value)");                                          \
+  XAI_CHECK_ERROR(((XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_WIDTH_SHIFT(param) < 32) && (XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_HEIGHT_SHIFT(param) < 32)),         \
+                  XAI_ERR_NORM, "shiftPool should be less than 32 (scalar shift value)");
+
+#define XAI_CHECK_REORG_PARAMS_DWH(inTile, outTile, params) /* validates reorg tile dimensions */   \
+  if (XAI_CNN_REORG_GET_REVERSE(params)) /* reverse: inTile depth = stride^2 * outTile depth */     \
+  {                                                                                                 \
+    XAI_CHECK_ERROR(XAI_CNN_REORG_GET_STRIDE(params) * XAI_CNN_REORG_GET_STRIDE(params) *           \
+                    XAI_TILE3D_GET_DIM1(outTile) == XAI_TILE3D_GET_DIM1(inTile),                    \
+                    XAI_ERR_DATASIZE, "The depth dimension of inTile and outTile is inconsistent"); \
+    /* Width is DIM2 and height is DIM3 here; both scale by the same stride. */                     \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTile) == XAI_TILE3D_GET_DIM2(inTile) *                   \
+                    XAI_CNN_REORG_GET_STRIDE(params), XAI_ERR_DATASIZE,                             \
+                    "The width dimension of inTile and outTile is inconsistent");                   \
+                                                                                                    \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTile) == XAI_TILE3D_GET_DIM3(inTile) *                   \
+                    XAI_CNN_REORG_GET_STRIDE(params), XAI_ERR_DATASIZE,                             \
+                    "The height dimension of inTile and outTile is inconsistent");                  \
+  }                                                                                                 \
+  else /* non-reverse: outTile depth = stride^2 * inTile depth */                                   \
+  {                                                                                                 \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTile) == XAI_CNN_REORG_GET_STRIDE(params) *              \
+                    XAI_CNN_REORG_GET_STRIDE(params) * XAI_TILE3D_GET_DIM1(inTile),                 \
+                    XAI_ERR_DATASIZE, "The depth dimension of inTile and outTile is inconsistent"); \
+                                                                                                    \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTile) * XAI_CNN_REORG_GET_STRIDE(params) ==              \
+                    XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE,                                  \
+                    "The width dimension of inTile and outTile is inconsistent");                   \
+                                                                                                    \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTile) * XAI_CNN_REORG_GET_STRIDE(params) ==              \
+                    XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE,                                  \
+                    "The height dimension of inTile and outTile is inconsistent");                  \
+  }
+
+#define XAI_CHECK_REORG4D_PARAMS_WHDN(inTile, outTile, params) /* validates 4D reorg tile dims */   \
+  if (XAI_CNN_REORG_GET_REVERSE(params)) /* NOTE(review): non-4D REVERSE accessor used; confirm */  \
+  { /* reverse: inTile batch = strideX * strideY * outTile batch */                                 \
+    XAI_CHECK_ERROR(XAI_CNN_REORG4D_GET_STRIDEX(params) * XAI_CNN_REORG4D_GET_STRIDEY(params) *     \
+                    XAI_TILE4D_GET_DIM4(outTile) == XAI_TILE4D_GET_DIM4(inTile),                    \
+                    XAI_ERR_DATASIZE, "The batch dimension of inTile and outTile is inconsistent"); \
+    /* Width is DIM1 and height is DIM2 here; both scale by their own stride. */                    \
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(outTile) == XAI_TILE4D_GET_DIM1(inTile) *                   \
+                    XAI_CNN_REORG4D_GET_STRIDEX(params), XAI_ERR_DATASIZE,                          \
+                    "The width dimension of inTile and outTile is inconsistent");                   \
+                                                                                                    \
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(outTile) == XAI_TILE4D_GET_DIM2(inTile) *                   \
+                    XAI_CNN_REORG4D_GET_STRIDEY(params), XAI_ERR_DATASIZE,                          \
+                    "The height dimension of inTile and outTile is inconsistent");                  \
+  }                                                                                                 \
+  else /* non-reverse: outTile batch = strideX * strideY * inTile batch */                          \
+  {                                                                                                 \
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM4(outTile) == XAI_CNN_REORG4D_GET_STRIDEX(params) *           \
+                    XAI_CNN_REORG4D_GET_STRIDEY(params) * XAI_TILE4D_GET_DIM4(inTile),              \
+                    XAI_ERR_DATASIZE, "The batch dimension of inTile and outTile is inconsistent"); \
+                                                                                                    \
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(outTile) * XAI_CNN_REORG4D_GET_STRIDEX(params) ==           \
+                    XAI_TILE4D_GET_DIM1(inTile), XAI_ERR_DATASIZE,                                  \
+                    "The width dimension of inTile and outTile is inconsistent");                   \
+                                                                                                    \
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(outTile) * XAI_CNN_REORG4D_GET_STRIDEY(params) ==           \
+                    XAI_TILE4D_GET_DIM2(inTile), XAI_ERR_DATASIZE,                                  \
+                    "The height dimension of inTile and outTile is inconsistent");                  \
+  }
+#define XAI_CHECK_REORG4D_PARAMS_DWHN(inTile, outTile, params) /* validates 4D reorg tile dims */   \
+  if (XAI_CNN_REORG_GET_REVERSE(params)) /* NOTE(review): non-4D REVERSE accessor used; confirm */  \
+  { /* reverse: inTile batch = strideX * strideY * outTile batch */                                 \
+    XAI_CHECK_ERROR(XAI_CNN_REORG4D_GET_STRIDEX(params) * XAI_CNN_REORG4D_GET_STRIDEY(params) *     \
+                    XAI_TILE4D_GET_DIM4(outTile) == XAI_TILE4D_GET_DIM4(inTile),                    \
+                    XAI_ERR_DATASIZE, "The batch dimension of inTile and outTile is inconsistent"); \
+    /* Width is DIM2 and height is DIM3 here; both scale by their own stride. */                    \
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(outTile) == XAI_TILE4D_GET_DIM2(inTile) *                   \
+                    XAI_CNN_REORG4D_GET_STRIDEX(params), XAI_ERR_DATASIZE,                          \
+                    "The width dimension of inTile and outTile is inconsistent");                   \
+                                                                                                    \
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM3(outTile) == XAI_TILE4D_GET_DIM3(inTile) *                   \
+                    XAI_CNN_REORG4D_GET_STRIDEY(params), XAI_ERR_DATASIZE,                          \
+                    "The height dimension of inTile and outTile is inconsistent");                  \
+  }                                                                                                 \
+  else /* non-reverse: outTile batch = strideX * strideY * inTile batch */                          \
+  {                                                                                                 \
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM4(outTile) == XAI_CNN_REORG4D_GET_STRIDEX(params) *           \
+                    XAI_CNN_REORG4D_GET_STRIDEY(params) * XAI_TILE4D_GET_DIM4(inTile),              \
+                    XAI_ERR_DATASIZE, "The batch dimension of inTile and outTile is inconsistent"); \
+                                                                                                    \
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(outTile) * XAI_CNN_REORG4D_GET_STRIDEX(params) ==           \
+                    XAI_TILE4D_GET_DIM2(inTile), XAI_ERR_DATASIZE,                                  \
+                    "The width dimension of inTile and outTile is inconsistent");                   \
+                                                                                                    \
+    XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM3(outTile) * XAI_CNN_REORG4D_GET_STRIDEY(params) ==           \
+                    XAI_TILE4D_GET_DIM3(inTile), XAI_ERR_DATASIZE,                                  \
+                    "The height dimension of inTile and outTile is inconsistent");                  \
+  }
+
+#define XAI_CHECK_REORG_PARAMS_WHD(inT, outT, param) /* validates reorg tile dimensions */            \
+  if (XAI_CNN_REORG_GET_REVERSE(param)) /* reverse: inT has stride^2 x the channels of outT */        \
+  {                                                                                                   \
+    XAI_CHECK_ERROR((XAI_CNN_REORG_GET_STRIDE(param) * XAI_CNN_REORG_GET_STRIDE(param) *              \
+                     XAI_TILE3D_GET_DIM3(outT)) == XAI_TILE3D_GET_DIM3(inT), XAI_ERR_DATASIZE,        \
+                    "Number of input channels is strideX * strideY times number of output channels"); \
+                                                                                                      \
+    XAI_CHECK_ERROR(XAI_CNN_REORG_GET_STRIDE(param) * XAI_TILE3D_GET_DIM1(inT) ==                     \
+                    XAI_TILE3D_GET_DIM1(outT), XAI_ERR_DATASIZE,                                      \
+                    "Output width is strideX times input width");                                     \
+                                                                                                      \
+    XAI_CHECK_ERROR(XAI_CNN_REORG_GET_STRIDE(param) * XAI_TILE3D_GET_DIM2(inT) ==                     \
+                    XAI_TILE3D_GET_DIM2(outT), XAI_ERR_DATASIZE,                                      \
+                    "Output height is strideY times input height");                                   \
+                                                                                                      \
+  }                                                                                                   \
+  else /* non-reverse: outT has stride^2 x the channels of inT */                                     \
+  {                                                                                                   \
+    XAI_CHECK_ERROR((XAI_CNN_REORG_GET_STRIDE(param) * XAI_CNN_REORG_GET_STRIDE(param) *              \
+                     XAI_TILE3D_GET_DIM3(inT)) == XAI_TILE3D_GET_DIM3(outT), XAI_ERR_DATASIZE,        \
+                    "Number of output channels is strideX * strideY times number of input channels"); \
+                                                                                                      \
+    XAI_CHECK_ERROR(XAI_CNN_REORG_GET_STRIDE(param) * XAI_TILE3D_GET_DIM1(outT) ==                    \
+                    XAI_TILE3D_GET_DIM1(inT), XAI_ERR_DATASIZE,                                       \
+                    "Input width is strideX times output width");                                     \
+                                                                                                      \
+    XAI_CHECK_ERROR(XAI_CNN_REORG_GET_STRIDE(param) * XAI_TILE3D_GET_DIM2(outT) ==                    \
+                    XAI_TILE3D_GET_DIM2(inT), XAI_ERR_DATASIZE,                                       \
+                    "Input height is strideY times output height");                                   \
+  }
+
+#if XAI_ERROR_LEVEL != XAI_ERROR_LEVEL_NO_ERROR
+#define XAI_CHECK_INTERP_BOUNDARY(xDstCoordinate, yDstCoordinate, zDstCoordinate, xSrcCoordinate, ySrcCoordinate, zSrcCoordinate,                        \
+                                  xScale, yScale, xShift, yShift, inDataWidth, inDataHeight, inDataDepth, outDataWidth, outDataHeight, outDataDepth,     \
+                                  edge1AcrossWidth, edge2AcrossWidth, edge1AcrossHeight, edge2AcrossHeight,                                              \
+                                  inFrameWidth, inFrameHeight)                                                                                           \
+  {                                                                                                                                                      \
+    int32_t insideFrameX;                                                                                                                                \
+    int32_t insideFrameY;                                                                                                                                \
+                                                                                                                                                         \
+    int32_t xmax = (((xDstCoordinate + outDataWidth - 1) * xScale + xShift) >> 18) + 1;                                                                  \
+    int32_t ymax = (((yDstCoordinate + outDataHeight - 1) * yScale + yShift) >> 18) + 1;                                                                 \
+    int32_t zmax = (zDstCoordinate + outDataDepth);                                                                                                      \
+                                                                                                                                                         \
+    insideFrameX = (xmax < inFrameWidth);                                                                                                                \
+    insideFrameY = (ymax < inFrameHeight);                                                                                                               \
+                                                                                                                                                         \
+    XAI_CHECK_ERROR(((((xDstCoordinate * xScale + xShift < 0) || ((xDstCoordinate * xScale + xShift) >> 18) >= (xSrcCoordinate - edge1AcrossWidth))) &&  \
+                     (((yDstCoordinate * yScale + yShift < 0) || ((yDstCoordinate * yScale + yShift) >> 18) >= (ySrcCoordinate - edge1AcrossHeight))) && \
+                     (((zDstCoordinate) >= (zSrcCoordinate))) &&                                                                                         \
+                     (((xmax + insideFrameX) <= (xSrcCoordinate + inDataWidth + edge2AcrossWidth))) &&                                                   \
+                     (((ymax + insideFrameY) <= (ySrcCoordinate + inDataHeight + edge2AcrossHeight))) &&                                                 \
+                     ((zmax <= (zSrcCoordinate + inDataDepth)))),                                                                                        \
+                    XAI_ERR_DATASIZE, "The input tile size requirements is in sufficient");                                                              \
+  }
+#else
+#define XAI_CHECK_INTERP_BOUNDARY(xDstCoordinate, yDstCoordinate, zDstCoordinate, xSrcCoordinate, ySrcCoordinate, zSrcCoordinate,                    \
+                                  xScale, yScale, xShift, yShift, inDataWidth, inDataHeight, inDataDepth, outDataWidth, outDataHeight, outDataDepth, \
+                                  edge1AcrossWidth, edge2AcrossWidth, edge1AcrossHeight, edge2AcrossHeight,                                          \
+                                  inFrameWidth, inFrameHeight)
+#endif
+
+#if XAI_ERROR_LEVEL != XAI_ERROR_LEVEL_NO_ERROR
+#define XAI_CHECK_RESIZENEAREST_BOUNDARY(xDstCoordinate, yDstCoordinate, zDstCoordinate,               \
+                                         xSrcCoordinate, ySrcCoordinate, zSrcCoordinate,               \
+                                         xScale, yScale, xShift, yShift,                               \
+                                         inDataWidth, inDataHeight, inDataDepth,                       \
+                                         outDataWidth, outDataHeight, outDataDepth,                    \
+                                         edge1AcrossWidth, edge2AcrossWidth, edge1AcrossHeight,        \
+                                         edge2AcrossHeight, inFrameWidth, inFrameHeight)               \
+  {                                                                                                    \
+    int32_t xmin = ((xDstCoordinate * xScale) + xShift);                                               \
+    int32_t ymin = ((yDstCoordinate * yScale) + yShift);                                               \
+    int32_t zmin = (zDstCoordinate);                                                                   \
+    int32_t xmax = (((xDstCoordinate + outDataWidth - 1) * xScale + xShift) >> 18) + 1;                \
+    int32_t ymax = (((yDstCoordinate + outDataHeight - 1) * yScale + yShift) >> 18) + 1;               \
+    int32_t zmax = (zDstCoordinate + outDataDepth);                                                    \
+                                                                                                       \
+    int32_t insideFrameX = (xmax < inFrameWidth);                                                      \
+    int32_t insideFrameY = (ymax < inFrameHeight);                                                     \
+                                                                                                       \
+    XAI_CHECK_ERROR((((xmin < 0 || (xmin >> 18) >= (xSrcCoordinate - edge1AcrossWidth))) &&            \
+                     ((ymin < 0 || (ymin >> 18) >= (ySrcCoordinate - edge1AcrossHeight))) &&           \
+                     (zmin >= (zSrcCoordinate)) &&                                                     \
+                     ((xmax + insideFrameX) <= (xSrcCoordinate + inDataWidth + edge2AcrossWidth)) &&   \
+                     ((ymax + insideFrameY) <= (ySrcCoordinate + inDataHeight + edge2AcrossHeight)) && \
+                     (zmax <= (zSrcCoordinate + inDataDepth))),                                        \
+                    XAI_ERR_DATASIZE, "The input tile size requirements is in sufficient");            \
+  }
+#else
+#define XAI_CHECK_RESIZENEAREST_BOUNDARY(xDstCoordinate, yDstCoordinate, zDstCoordinate,        \
+                                         xSrcCoordinate, ySrcCoordinate, zSrcCoordinate,        \
+                                         xScale, yScale, xShift, yShift,                        \
+                                         inDataWidth, inDataHeight, inDataDepth,                \
+                                         outDataWidth, outDataHeight, outDataDepth,             \
+                                         edge1AcrossWidth, edge2AcrossWidth, edge1AcrossHeight, \
+                                         edge2AcrossHeight, inFrameWidth, inFrameHeight)
+#endif
+
+#define XAI_CHECK_CONSISTENCY_MAXVALARR8(maxValArr, params, tileFlag) /* S8 maxValArr sanity check */       \
+  {                                                                                                         \
+    if (XAI_CNN_MAXVAL_GET_TILEFLAG(params) != (tileFlag)) /* array is only validated when flags differ */  \
+    {                                                                                                       \
+      XAI_CHECK_ARRAY_S8(maxValArr);                                                                        \
+      XAI_CHECK_ERROR((XAI_ARRAY_GET_WIDTH(maxValArr) >= XCHAL_IVPN_SIMD_WIDTH),                            \
+                      XAI_ERR_BADARG, "Length of maxValArr should not be less than XCHAL_IVPN_SIMD_WIDTH"); \
+      XAI_CHECK_ERROR((XAI_ARRAY_GET_HEIGHT(maxValArr) > 0), XAI_ERR_BADARG,                                \
+                      "maxValArr height parameter is not set as required");                                 \
+    }                                                                                                       \
+  }
+#define XAI_CHECK_CONSISTENCY_MAXVALARR(maxValArr, params, tileFlag) /* S16 maxValArr sanity check */       \
+  {                                                                                                         \
+    if (XAI_CNN_MAXVAL_GET_TILEFLAG(params) != (tileFlag)) /* array is only validated when flags differ */  \
+    {                                                                                                       \
+      XAI_CHECK_ARRAY_S16(maxValArr);                                                                       \
+      XAI_CHECK_ERROR((XAI_ARRAY_GET_WIDTH(maxValArr) >= XCHAL_IVPN_SIMD_WIDTH),                            \
+                      XAI_ERR_BADARG, "Length of maxValArr should not be less than XCHAL_IVPN_SIMD_WIDTH"); \
+      XAI_CHECK_ERROR((XAI_ARRAY_GET_HEIGHT(maxValArr) > 0), XAI_ERR_BADARG,                                \
+                      "maxValArr height parameter is not set as required");                                 \
+    }                                                                                                       \
+  }
+
+#define XAI_CHECK_PERMUTE_PARAMS(params) /* Orders must be a permutation of 1..4. */                       \
+  XAI_CHECK_ERROR((XAI_CNN_PERMUTE4D_GET_ORDER1(params) > 0 && XAI_CNN_PERMUTE4D_GET_ORDER2(params) > 0 && \
+                   XAI_CNN_PERMUTE4D_GET_ORDER3(params) > 0 && XAI_CNN_PERMUTE4D_GET_ORDER4(params) > 0),  \
+                  XAI_ERR_BADARG, "The order should be greater than 0");                                   \
+  XAI_CHECK_ERROR((XAI_CNN_PERMUTE4D_GET_ORDER1(params) < 5 && XAI_CNN_PERMUTE4D_GET_ORDER2(params) < 5 && \
+                   XAI_CNN_PERMUTE4D_GET_ORDER3(params) < 5 && XAI_CNN_PERMUTE4D_GET_ORDER4(params) < 5),  \
+                  XAI_ERR_BADARG, "The order should be less than 5");                                      \
+  XAI_CHECK_ERROR(((XAI_CNN_PERMUTE4D_GET_ORDER1(params) != XAI_CNN_PERMUTE4D_GET_ORDER2(params)) &&       \
+                   (XAI_CNN_PERMUTE4D_GET_ORDER1(params) != XAI_CNN_PERMUTE4D_GET_ORDER3(params)) &&       \
+                   (XAI_CNN_PERMUTE4D_GET_ORDER1(params) != XAI_CNN_PERMUTE4D_GET_ORDER4(params)) &&       \
+                   (XAI_CNN_PERMUTE4D_GET_ORDER2(params) != XAI_CNN_PERMUTE4D_GET_ORDER3(params)) &&       \
+                   (XAI_CNN_PERMUTE4D_GET_ORDER2(params) != XAI_CNN_PERMUTE4D_GET_ORDER4(params)) &&       \
+                   (XAI_CNN_PERMUTE4D_GET_ORDER3(params) != XAI_CNN_PERMUTE4D_GET_ORDER4(params))),        \
+                  XAI_ERR_BADARG, "The order values should not be equal to one another");
+
+#if XAI_ERROR_LEVEL != XAI_ERROR_LEVEL_NO_ERROR
+#define XAI_CHECK_CONSISTENCY_PERMUTE(inT, outT, params)                                                                  \
+  {                                                                                                                       \
+    uint8_t order[4] = { XAI_CNN_PERMUTE4D_GET_ORDER1(params),                                                            \
+                         XAI_CNN_PERMUTE4D_GET_ORDER2(params),                                                            \
+                         XAI_CNN_PERMUTE4D_GET_ORDER3(params),                                                            \
+                         XAI_CNN_PERMUTE4D_GET_ORDER4(params) };                                                          \
+    int32_t inDim[4] = { XAI_TILE4D_GET_DIM1(inT),                                                                        \
+                         XAI_TILE4D_GET_DIM2(inT),                                                                        \
+                         XAI_TILE4D_GET_DIM3(inT),                                                                        \
+                         XAI_TILE4D_GET_DIM4(inT) };                                                                      \
+                                                                                                                          \
+    const int32_t transposedDim1 = inDim[order[0] - 1];                                                                   \
+    const int32_t transposedDim2 = inDim[order[1] - 1];                                                                   \
+    const int32_t transposedDim3 = inDim[order[2] - 1];                                                                   \
+    const int32_t transposedDim4 = inDim[order[3] - 1];                                                                   \
+    XAI_CHECK_ERROR((transposedDim1 == XAI_TILE4D_GET_DIM1(outT) && transposedDim2 == XAI_TILE4D_GET_DIM2(outT)           \
+                     && transposedDim3 == XAI_TILE4D_GET_DIM3(outT) && transposedDim4 == XAI_TILE4D_GET_DIM4(outT)),      \
+                    XAI_ERR_DATASIZE, "The dimensions of the output tile should be equal to the transposed dimensions of the \
+                        input tile whose order is specified by the parameter in the xai_cnn_permute4D_params structure"); \
+  }
+#else
+#define XAI_CHECK_CONSISTENCY_PERMUTE(inT, outT, params)
+#endif
+#endif
+
+#define XAI_CHECK_CONSISTENCY_ARGMAX_3D_DIM1(inTile, outTileIdx, outTileVal, numLargestVal)             \
+  { /* Validates the optional index/value output tiles; a NULL tile is not checked. */                  \
+    if (outTileIdx != NULL)                                                                             \
+    { /* index tile: DIM1 == numLargestVal; DIM2/DIM3 match inTile */                                   \
+      XAI_CHECK_TILE3D_U16(outTileIdx);                                                                 \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx);                                                    \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileIdx);                                            \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == numLargestVal, XAI_ERR_DATASIZE,               \
+                      "Output index tile size is incorrect");                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \
+                      "Output index tile size is incorrect");                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \
+                      "Output index tile size is incorrect");                                           \
+    }                                                                                                   \
+    if (outTileVal != NULL)                                                                             \
+    { /* value tile: same type as inTile; same shape rule as the index tile */                          \
+      XAI_CHECK_TILE3D(outTileVal);                                                                     \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTile) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \
+                      "Data type of output tile must be same as input tile");                           \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal);                                                    \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileVal);                                            \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == numLargestVal, XAI_ERR_DATASIZE,               \
+                      "Output tile size is incorrect");                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \
+                      "Output tile size is incorrect");                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \
+                      "Output tile size is incorrect");                                                 \
+    }                                                                                                   \
+    if ((outTileVal != NULL) && (outTileIdx != NULL))                                                   \
+    { /* both outputs given: run the overlap check between them */                                      \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal);                                        \
+    }                                                                                                   \
+  }
+
+#define XAI_CHECK_CONSISTENCY_MERGE_TOPK_ARGMAX_ARGMIN_3D_DIM1(inTileIdx, inTileVal, outTileIdx, outTileVal, numVal) \
+  { /* Validates the optional index/value output tiles; a NULL tile is not checked. */                               \
+    if (outTileIdx != NULL)                                                                                          \
+    { /* index tile: DIM1 == numVal; DIM2/DIM3 match inTileVal */                                                    \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == numVal, XAI_ERR_DATASIZE,                                   \
+                      "Output index tile size is incorrect");                                                        \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == XAI_TILE3D_GET_DIM2(inTileVal), XAI_ERR_DATASIZE,           \
+                      "Output index tile size is incorrect");                                                        \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == XAI_TILE3D_GET_DIM3(inTileVal), XAI_ERR_DATASIZE,           \
+                      "Output index tile size is incorrect");                                                        \
+      XAI_CHECK_TILE3D_S32(outTileIdx);                                                                              \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx);                                                                 \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTileIdx, outTileIdx);                                                      \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTileVal, outTileIdx);                                                      \
+    }                                                                                                                \
+    if (outTileVal != NULL)                                                                                          \
+    { /* value tile: same type as inTileVal; same shape rule as the index tile */                                    \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == numVal, XAI_ERR_DATASIZE,                                   \
+                      "Output tile size is incorrect");                                                              \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == XAI_TILE3D_GET_DIM2(inTileVal), XAI_ERR_DATASIZE,           \
+                      "Output tile size is incorrect");                                                              \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == XAI_TILE3D_GET_DIM3(inTileVal), XAI_ERR_DATASIZE,           \
+                      "Output tile size is incorrect");                                                              \
+      XAI_CHECK_TILE3D(outTileVal);                                                                                  \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTileVal) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE,           \
+                      "Data type of output tile must be same as input tile");                                        \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal);                                                                 \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTileVal, outTileVal);                                                      \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTileIdx, outTileVal);                                                      \
+    }                                                                                                                \
+    if ((outTileVal != NULL) && (outTileIdx != NULL))                                                                \
+    { /* both outputs given: run the overlap check between them */                                                   \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal);                                                     \
+    }                                                                                                                \
+  }
+#define XAI_CHECK_CONSISTENCY_ARGMAX_3D_DIM2(inTile, outTileIdx, outTileVal, numLargestVal)             \
+  { /* Validates the optional index/value output tiles; a NULL tile is not checked. */                  \
+    if (outTileIdx != NULL)                                                                             \
+    { /* index tile: DIM2 == numLargestVal; DIM1/DIM3 match inTile */                                   \
+      XAI_CHECK_TILE3D_U16(outTileIdx);                                                                 \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx);                                                    \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileIdx);                                            \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == numLargestVal, XAI_ERR_DATASIZE,               \
+                      "Output tile size is incorrect");                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \
+                      "Output tile size is incorrect");                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \
+                      "Output tile size is incorrect");                                                 \
+    }                                                                                                   \
+    if (outTileVal != NULL)                                                                             \
+    { /* value tile: same type as inTile; same shape rule as the index tile */                          \
+      XAI_CHECK_TILE3D(outTileVal);                                                                     \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTile) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \
+                      "Data type of output tile must be same as input tile");                           \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal);                                                    \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileVal);                                            \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == numLargestVal, XAI_ERR_DATASIZE,               \
+                      "Output tile size is incorrect");                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \
+                      "Output tile size is incorrect");                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \
+                      "Output tile size is incorrect");                                                 \
+    }                                                                                                   \
+    if ((outTileVal != NULL) && (outTileIdx != NULL))                                                   \
+    { /* both outputs given: run the overlap check between them */                                      \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal);                                        \
+    }                                                                                                   \
+  }
+
+#define XAI_CHECK_CONSISTENCY_ARGMAX_3D_DIM3(inTile, outTileIdx, outTileVal, numLargestVal)             \
+  { /* Validates the optional index/value output tiles; a NULL tile is not checked. */                  \
+    if (outTileIdx != NULL)                                                                             \
+    { /* index tile: DIM3 == numLargestVal; DIM1/DIM2 match inTile */                                   \
+      XAI_CHECK_TILE3D_U16(outTileIdx);                                                                 \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx);                                                    \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileIdx);                                            \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == numLargestVal, XAI_ERR_DATASIZE,               \
+                      "Output index tile size is incorrect");                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \
+                      "Output index tile size is incorrect");                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \
+                      "Output index tile size is incorrect");                                           \
+    }                                                                                                   \
+    if (outTileVal != NULL)                                                                             \
+    { /* value tile: same type as inTile; same shape rule as the index tile */                          \
+      XAI_CHECK_TILE3D(outTileVal);                                                                     \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTile) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \
+                      "Data type of output tile must be same as input tile");                           \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal);                                                    \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileVal);                                            \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == numLargestVal, XAI_ERR_DATASIZE,               \
+                      "Output value tile size is incorrect");                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \
+                      "Output value tile size is incorrect");                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \
+                      "Output value tile size is incorrect");                                           \
+    }                                                                                                   \
+    if ((outTileVal != NULL) && (outTileIdx != NULL))                                                   \
+    { /* both outputs given: run the overlap check between them */                                      \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal);                                        \
+    }                                                                                                   \
+  }
+
+#define XAI_CHECK_CONSISTENCY_ARGMAX_3D_DIM1_F32(inTile, outTileIdx, outTileVal, numLargestVal)         \
+  { /* Validates the optional index/value output tiles; a NULL tile is not checked. */                  \
+    if (outTileIdx != NULL)                                                                             \
+    { /* index tile: DIM1 == numLargestVal; DIM2/DIM3 match inTile */                                   \
+      XAI_CHECK_TILE3D_U32(outTileIdx);                                                                 \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx);                                                    \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileIdx);                                            \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == numLargestVal, XAI_ERR_DATASIZE,               \
+                      "Output index tile size is incorrect");                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \
+                      "Output index tile size is incorrect");                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \
+                      "Output index tile size is incorrect");                                           \
+    }                                                                                                   \
+    if (outTileVal != NULL)                                                                             \
+    { /* value tile: same type as inTile; same shape rule as the index tile */                          \
+      XAI_CHECK_TILE3D(outTileVal);                                                                     \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTile) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \
+                      "Data type of output tile must be same as input tile");                           \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal);                                                    \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileVal);                                            \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == numLargestVal, XAI_ERR_DATASIZE,               \
+                      "Output tile size is incorrect");                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \
+                      "Output tile size is incorrect");                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \
+                      "Output tile size is incorrect");                                                 \
+    }                                                                                                   \
+    if ((outTileVal != NULL) && (outTileIdx != NULL))                                                   \
+    { /* both outputs given: run the overlap check between them */                                      \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal);                                        \
+    }                                                                                                   \
+  }
+
+#define XAI_CHECK_CONSISTENCY_ARGMAX_3D_DIM2_F32(inTile, outTileIdx, outTileVal, numLargestVal)         \
+  { /* Validates the optional index/value output tiles; a NULL tile is not checked. */                  \
+    if (outTileIdx != NULL)                                                                             \
+    { /* index tile: DIM2 == numLargestVal; DIM1/DIM3 match inTile */                                   \
+      XAI_CHECK_TILE3D_U32(outTileIdx);                                                                 \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx);                                                    \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileIdx);                                            \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == numLargestVal, XAI_ERR_DATASIZE,               \
+                      "Output tile size is incorrect");                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \
+                      "Output tile size is incorrect");                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \
+                      "Output tile size is incorrect");                                                 \
+    }                                                                                                   \
+    if (outTileVal != NULL)                                                                             \
+    { /* value tile: same type as inTile; same shape rule as the index tile */                          \
+      XAI_CHECK_TILE3D(outTileVal);                                                                     \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTile) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \
+                      "Data type of output tile must be same as input tile");                           \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal);                                                    \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileVal);                                            \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == numLargestVal, XAI_ERR_DATASIZE,               \
+                      "Output tile size is incorrect");                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \
+                      "Output tile size is incorrect");                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \
+                      "Output tile size is incorrect");                                                 \
+    }                                                                                                   \
+    if ((outTileVal != NULL) && (outTileIdx != NULL))                                                   \
+    { /* both outputs given: run the overlap check between them */                                      \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal);                                        \
+    }                                                                                                   \
+  }
+
+#define XAI_CHECK_CONSISTENCY_ARGMAX_3D_DIM3_F32(inTile, outTileIdx, outTileVal, numLargestVal)         \
+  { /* Validates the optional index/value output tiles; a NULL tile is not checked. */                  \
+    if (outTileIdx != NULL)                                                                             \
+    { /* index tile: DIM3 == numLargestVal; DIM1/DIM2 match inTile */                                   \
+      XAI_CHECK_TILE3D_U32(outTileIdx);                                                                 \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx);                                                    \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileIdx);                                            \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == numLargestVal, XAI_ERR_DATASIZE,               \
+                      "Output index tile size is incorrect");                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \
+                      "Output index tile size is incorrect");                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \
+                      "Output index tile size is incorrect");                                           \
+    }                                                                                                   \
+    if (outTileVal != NULL)                                                                             \
+    { /* value tile: same type as inTile; same shape rule as the index tile */                          \
+      XAI_CHECK_TILE3D(outTileVal);                                                                     \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTile) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \
+                      "Data type of output tile must be same as input tile");                           \
+      XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal);                                                    \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileVal);                                            \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == numLargestVal, XAI_ERR_DATASIZE,               \
+                      "Output value tile size is incorrect");                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \
+                      "Output value tile size is incorrect");                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \
+                      "Output value tile size is incorrect");                                           \
+    }                                                                                                   \
+    if ((outTileVal != NULL) && (outTileIdx != NULL))                                                   \
+    { /* both outputs given: run the overlap check between them */                                      \
+      XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal);                                        \
+    }                                                                                                   \
+  }
+
+#define XAI_CHECK_DIM_IN128DWH(coeffIn, coeffOut)                                                                     \
+  { /* Checks coeffOut's DIM1 and DIM2 against the sizes expected from coeffIn. */                                    \
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffOut) % 128) == 0, XAI_ERR_DATASIZE,                                     \
+                    "The dimension 1 of the output tile should be a multiple of 128");                                \
+    /* The input dim compared below depends on coeffIn's data order. */                                               \
+    if ((XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_WHDN) ||                                                           \
+        (XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_DWHN))                                                             \
+    {                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) << 7) ==                                                         \
+                      (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM4(coeffIn), 2 * XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE,     \
+                      "The allocated output channels size in the IN128DWH tile is a multiple of 128");                \
+    }                                                                                                                 \
+    else                                                                                                              \
+    {                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) << 7) ==                                                         \
+                      (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM1(coeffIn), 2 * XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE,     \
+                      "The dimension 2 of the output tile should be a multiple of 128");                              \
+    }                                                                                                                 \
+  }
+#if (XCHAL_IVPN_SIMD_WIDTH == 64)
+#define XAI_CHECK_DIM_IN64DWH(coeffIn, coeffOut)                                                                  \
+  { /* Checks coeffOut's DIM1 and DIM2 against the sizes expected from coeffIn. */                                \
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffOut) % 64) == 0, XAI_ERR_DATASIZE,                                  \
+                    "The dimension 1 of the output tile should be a multiple of 64");                             \
+    /* The input dim compared below depends on coeffIn's data order. */                                           \
+    if ((XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_WHDN) ||                                                       \
+        (XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_DWHN))                                                         \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) << 6) ==                                                     \
+                      (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM4(coeffIn), XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE,     \
+                      "The allocated output channels size in the IN64DWH tile is a multiple of 64");              \
+    }                                                                                                             \
+    else                                                                                                          \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) << 6) ==                                                     \
+                      (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM1(coeffIn), XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE,     \
+                      "The dimension 2 of the output tile should be a multiple of 64");                           \
+    }                                                                                                             \
+  }
+
+#define XAI_CHECK_DIM_IN32DWH(coeffIn, coeffOut)                                                                  \
+  { /* Check IN32DWH coefficient tile coeffOut against its source tile coeffIn. */                                \
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffOut) % 32) == 0, XAI_ERR_DATASIZE,                                  \
+                    "The dimension 1 of the output tile should be a multiple of 32");                             \
+    /* Output dim2 << 5 is compared with XAI_ALIGN_VAL() of input dim4 (WHDN/DWHN) or dim1 (else). */             \
+    if ((XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_WHDN) ||                                                       \
+        (XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_DWHN))                                                         \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) << 5) ==                                                     \
+                      (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM4(coeffIn), XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE,     \
+                      "The allocated output channels size in the IN32DWH tile is a multiple of 32");              \
+    }                                                                                                             \
+    else                                                                                                          \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) << 5) ==                                                     \
+                      (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM1(coeffIn), XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE,     \
+                      "The dimension 2 of the output tile should be a multiple of 32");                           \
+    }                                                                                                             \
+  }
+
+#else
+#define XAI_CHECK_DIM_IN64DWH(coeffIn, coeffOut)                                                                      \
+  { /* Check IN64DWH coefficient tile coeffOut against its source tile coeffIn. */                                    \
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffOut) % (2 * XCHAL_IVPN_SIMD_WIDTH)) == 0, XAI_ERR_DATASIZE,             \
+                    "The dimension 1 of the output tile should be a multiple of 64");                                 \
+    /* Output dim2 * 2 * SIMD width is compared with XAI_ALIGN_VAL() of input dim4 (WHDN/DWHN) or dim1. */            \
+    if ((XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_WHDN) ||                                                           \
+        (XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_DWHN))                                                             \
+    {                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) * (2 * XCHAL_IVPN_SIMD_WIDTH)) ==                                \
+                      (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM4(coeffIn), 2 * XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE,     \
+                      "The allocated output channels size in the IN64DWH tile is a multiple of 64");                  \
+    }                                                                                                                 \
+    else                                                                                                              \
+    {                                                                                                                 \
+      XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) * (2 * XCHAL_IVPN_SIMD_WIDTH)) ==                                \
+                      (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM1(coeffIn), 2 * XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE,     \
+                      "The dimension 2 of the output tile should be a multiple of 64");                               \
+    }                                                                                                                 \
+  }
+
+#define XAI_CHECK_DIM_IN32DWH(coeffIn, coeffOut)                                                                  \
+  { /* Check IN32DWH coefficient tile coeffOut against its source tile coeffIn. */                                \
+    XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffOut) % XCHAL_IVPN_SIMD_WIDTH) == 0, XAI_ERR_DATASIZE,               \
+                    "The dimension 1 of the output tile should be a multiple of 32");                             \
+    /* Output dim2 * SIMD width is compared with XAI_ALIGN_VAL() of input dim4 (WHDN/DWHN) or dim1. */            \
+    if ((XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_WHDN) ||                                                       \
+        (XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_DWHN))                                                         \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) * XCHAL_IVPN_SIMD_WIDTH) ==                                  \
+                      (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM4(coeffIn), XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE,     \
+                      "The allocated output channels size in the IN32DWH tile is a multiple of 32");              \
+    }                                                                                                             \
+    else                                                                                                          \
+    {                                                                                                             \
+      XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) * XCHAL_IVPN_SIMD_WIDTH) ==                                  \
+                      (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM1(coeffIn), XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE,     \
+                      "The dimension 2 of the output tile should be a multiple of 32");                           \
+    }                                                                                                             \
+  }
+#endif
+
+#define XAI_CHECK_COEFF_IN_DATA_ORDER_FC(coeffIn)                                     \
+  { /* Accepts coeffIn only when its data order is XAI_NWHD or XAI_NDWH. */           \
+    XAI_CHECK_ERROR(((XAI_TILE3D_GET_DATA_ORDER(coeffIn) == XAI_NWHD) ||              \
+                     (XAI_TILE3D_GET_DATA_ORDER(coeffIn) == XAI_NDWH)),               \
+                    XAI_ERR_BADARG, "\nData Order of the given tiles not supported"); \
+  }
+
+/* To set appropriate pitch size for broadcast/normal elementwise operations */
+#define  XAI_TILE3D_GET_BCAST23_PITCH(inTile1, inTile2, outTile, in1Stride, in2Stride, \
+                                      in1Pitch1, in1Pitch2, in2Pitch1, in2Pitch2)      \
+  { /* Sets the four pitch outputs used for broadcast/normal elementwise ops. */       \
+    int32_t m_t1Ext2 = (XAI_TILE3D_GET_DIM2(inTile1) + in1Stride - 1) / in1Stride;     \
+    int32_t m_t1Ext3 = (XAI_TILE3D_GET_DIM3(inTile1) + in1Stride - 1) / in1Stride;     \
+    int32_t m_t2Ext2 = (XAI_TILE3D_GET_DIM2(inTile2) + in2Stride - 1) / in2Stride;     \
+    int32_t m_t2Ext3 = (XAI_TILE3D_GET_DIM3(inTile2) + in2Stride - 1) / in2Stride;     \
+    /* Keep a pitch when the strided input extent matches outTile; else use 0. */      \
+    in1Pitch1 = (m_t1Ext2 == XAI_TILE3D_GET_DIM2(outTile)) ?                           \
+                XAI_TILE3D_GET_DIM1_PITCH(inTile1) : 0;                                \
+    in1Pitch2 = (m_t1Ext3 == XAI_TILE3D_GET_DIM3(outTile)) ?                           \
+                XAI_TILE3D_GET_DIM2_PITCH(inTile1) : 0;                                \
+    in2Pitch1 = (m_t2Ext2 == XAI_TILE3D_GET_DIM2(outTile)) ?                           \
+                XAI_TILE3D_GET_DIM1_PITCH(inTile2) : 0;                                \
+    in2Pitch2 = (m_t2Ext3 == XAI_TILE3D_GET_DIM3(outTile)) ?                           \
+                XAI_TILE3D_GET_DIM2_PITCH(inTile2) : 0;                                \
+  }
+
+#define XAI_CHECK_REDUCE_DIM(inTile, outTile, params)                                                                                   \
+  { /* Per dimension: a dim flagged in the reduce config must be 1 in outTile, others must match inTile. */                             \
+    if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM1) == XAI_CNN_REDUCE_DIM1)                                               \
+    { /* XAI_CNN_REDUCE_DIM1 set: output dim1 must be 1 */                                                                              \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTile) == 1, XAI_ERR_DATASIZE,                                                              \
+                      "\nOutput tile dim1size = %d, size should be equal to 1", XAI_TILE3D_GET_DIM1(outTile));                          \
+    }                                                                                                                                   \
+    else                                                                                                                                \
+    { /* XAI_CNN_REDUCE_DIM1 clear: input and output dim1 must match */                                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1(outTile), XAI_ERR_DATASIZE,                                    \
+                      "\nInput tile dim1size = %d, Output tile dim1size = %d\nFirst dimension of input and output tile must be equal",  \
+                      XAI_TILE3D_GET_DIM1(inTile), XAI_TILE3D_GET_DIM1(outTile));                                                       \
+    }                                                                                                                                   \
+    if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM2) == XAI_CNN_REDUCE_DIM2)                                               \
+    { /* XAI_CNN_REDUCE_DIM2 set: output dim2 must be 1 */                                                                              \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTile) == 1, XAI_ERR_DATASIZE,                                                              \
+                      "\nOutput tile dim2size = %d, size should be equal to 1", XAI_TILE3D_GET_DIM2(outTile));                          \
+    }                                                                                                                                   \
+    else                                                                                                                                \
+    { /* XAI_CNN_REDUCE_DIM2 clear: input and output dim2 must match */                                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(inTile) == XAI_TILE3D_GET_DIM2(outTile), XAI_ERR_DATASIZE,                                    \
+                      "\nInput tile dim2size = %d, Output tile dim2size = %d\nSecond dimension of input and output tile must be equal", \
+                      XAI_TILE3D_GET_DIM2(inTile), XAI_TILE3D_GET_DIM2(outTile));                                                       \
+    }                                                                                                                                   \
+    if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM3) == XAI_CNN_REDUCE_DIM3)                                               \
+    { /* XAI_CNN_REDUCE_DIM3 set: output dim3 must be 1 */                                                                              \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTile) == 1, XAI_ERR_DATASIZE,                                                              \
+                      "\nOutput tile dim3size = %d, size should be equal to 1", XAI_TILE3D_GET_DIM3(outTile));                          \
+    }                                                                                                                                   \
+    else                                                                                                                                \
+    { /* XAI_CNN_REDUCE_DIM3 clear: input and output dim3 must match */                                                                 \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inTile) == XAI_TILE3D_GET_DIM3(outTile), XAI_ERR_DATASIZE,                                    \
+                      "\nInput tile dim3size = %d, Output tile dim3size = %d\nThird dimension of input and output tile must be equal",  \
+                      XAI_TILE3D_GET_DIM3(inTile), XAI_TILE3D_GET_DIM3(outTile));                                                       \
+    }                                                                                                                                   \
+  }
+
+#define XAI_CHECK_REDUCE_DIM4D(inTile, outTile, params)                                                                  \
+  { /* Flagged dims must be 1 in outTile, others must match inTile; the tile flag is range-checked too. */               \
+    if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM1) != XAI_CNN_REDUCE_DIM1)                                \
+    {                                                                                                                    \
+      XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(inTile) == XAI_TILE4D_GET_DIM1(outTile), XAI_ERR_DATASIZE,                     \
+                      "\nInput tile dim1size = %d, Output tile dim1size = %d\nInequality in first dimension",            \
+                      XAI_TILE4D_GET_DIM1(inTile), XAI_TILE4D_GET_DIM1(outTile));                                        \
+    }                                                                                                                    \
+    else                                                                                                                 \
+    {                                                                                                                    \
+      XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(outTile) == 1, XAI_ERR_DATASIZE,                                               \
+                      "\nOutput tile dim1size = %d, output first dimension should be 1", XAI_TILE4D_GET_DIM1(outTile));  \
+    }                                                                                                                    \
+    if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM2) != XAI_CNN_REDUCE_DIM2)                                \
+    {                                                                                                                    \
+      XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(inTile) == XAI_TILE4D_GET_DIM2(outTile), XAI_ERR_DATASIZE,                     \
+                      "\nInput tile dim2size = %d, Output tile dim2size = %d\nInequality in second dimension",           \
+                      XAI_TILE4D_GET_DIM2(inTile), XAI_TILE4D_GET_DIM2(outTile));                                        \
+    }                                                                                                                    \
+    else                                                                                                                 \
+    {                                                                                                                    \
+      XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(outTile) == 1, XAI_ERR_DATASIZE,                                               \
+                      "\nOutput tile dim2size = %d, output second dimension should be 1", XAI_TILE4D_GET_DIM2(outTile)); \
+    }                                                                                                                    \
+    if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM3) != XAI_CNN_REDUCE_DIM3)                                \
+    {                                                                                                                    \
+      XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM3(inTile) == XAI_TILE4D_GET_DIM3(outTile), XAI_ERR_DATASIZE,                     \
+                      "\nInput tile dim3size = %d, Output tile dim3size = %d\nInequality in third dimension",            \
+                      XAI_TILE4D_GET_DIM3(inTile), XAI_TILE4D_GET_DIM3(outTile));                                        \
+    }                                                                                                                    \
+    else                                                                                                                 \
+    {                                                                                                                    \
+      XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM3(outTile) == 1, XAI_ERR_DATASIZE,                                               \
+                      "\nOutput tile dim3size = %d, output third dimension should be 1", XAI_TILE4D_GET_DIM3(outTile));  \
+    }                                                                                                                    \
+    if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM4) != XAI_CNN_REDUCE_DIM4)                                \
+    {                                                                                                                    \
+      XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM4(inTile) == XAI_TILE4D_GET_DIM4(outTile), XAI_ERR_DATASIZE,                     \
+                      "\nInput tile dim4size = %d, Output tile dim4size = %d\nInequality in fourth dimension",           \
+                      XAI_TILE4D_GET_DIM4(inTile), XAI_TILE4D_GET_DIM4(outTile));                                        \
+    }                                                                                                                    \
+    else                                                                                                                 \
+    {                                                                                                                    \
+      XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM4(outTile) == 1, XAI_ERR_DATASIZE,                                               \
+                      "\nOutput tile dim4size = %d, output fourth dimension should be 1", XAI_TILE4D_GET_DIM4(outTile)); \
+    }                                                                                                                    \
+    XAI_CHECK_ERROR(XAI_CNN_REDUCE_GET_TILEFLAG(params) <= XAI_CNN_REDUCE_FIRST_LAST_TILE, XAI_ERR_BADARG,               \
+                    "\nTile Flag = %hhu, Incorrect Tile Flag", XAI_CNN_REDUCE_GET_TILEFLAG(params));                     \
+  }
+
+#define XAI_CHECK_TILE3D_SIZE_BCAST_EQ(in, out, inStride)                                                                       \
+  if (XAI_TILE3D_GET_DATA_ORDER(in) == XAI_WHD) /* NOTE: unbraced if/else-if; other data orders pass unchecked. */              \
+  { /* WHD: dims 1 and 2 are compared after ceil-division by inStride, dim 3 directly; an input dim of 1 also passes. */        \
+    XAI_CHECK_ERROR(                                                                                                            \
+      ((((XAI_TILE3D_GET_DIM1(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM1(out)) || (XAI_TILE3D_GET_DIM1(in) == 1)) && \
+       (((XAI_TILE3D_GET_DIM2(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM2(out)) || (XAI_TILE3D_GET_DIM2(in) == 1)) && \
+       ((XAI_TILE3D_GET_DIM3(in) == XAI_TILE3D_GET_DIM3(out)) || XAI_TILE3D_GET_DIM3(in) == 1)), XAI_ERR_DATASIZE,              \
+      "Invalid dimension in (" #in ") or (" #out ") to perform Elementwise broadcast operation");                               \
+  }                                                                                                                             \
+  else if (XAI_TILE3D_GET_DATA_ORDER(in) == XAI_DWH)                                                                            \
+  { /* DWH: dims 2 and 3 are compared after ceil-division by inStride, dim 1 directly; an input dim of 1 also passes. */        \
+    XAI_CHECK_ERROR(                                                                                                            \
+      ((((XAI_TILE3D_GET_DIM2(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM2(out)) || (XAI_TILE3D_GET_DIM2(in) == 1)) && \
+       (((XAI_TILE3D_GET_DIM3(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM3(out)) || (XAI_TILE3D_GET_DIM3(in) == 1)) && \
+       ((XAI_TILE3D_GET_DIM1(in) == XAI_TILE3D_GET_DIM1(out)) || XAI_TILE3D_GET_DIM1(in) == 1)), XAI_ERR_DATASIZE,              \
+      "Invalid dimension in (" #in ") or (" #out ") to perform Elementwise broadcast operation");                               \
+  }
+
+#define XAI_CHECK_TILE3D_SIZE_EQ_OR_BCAST(in, out, inStride)                                                 \
+  if (XAI_TILE3D_GET_DATA_ORDER(in) == XAI_WHD) /* NOTE: unbraced if/else-if; other orders are skipped */    \
+  { /* WHD layout: dim3 direct, dims 1 and 2 ceil-divided by inStride */                                     \
+    if ((XAI_TILE3D_GET_DIM3(in) == XAI_TILE3D_GET_DIM3(out)) &&                                             \
+        ((XAI_TILE3D_GET_DIM1(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM1(out)) &&                 \
+        ((XAI_TILE3D_GET_DIM2(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM2(out)))                   \
+    { /* same size: in-place use is allowed only with equal pitches */                                       \
+      if (XAI_TILE3D_GET_DATA_PTR(in) == XAI_TILE3D_GET_DATA_PTR(out))                                       \
+      {                                                                                                      \
+        XAI_CHECK_ERROR(XAI_TILE3D_PITCH_EQ(in, out), XAI_ERR_INPLACE, "Inplace operation not "              \
+                        "supported when pitch of ("#in ") and ("#out ") are not same");                      \
+      }                                                                                                      \
+    }                                                                                                        \
+    else                                                                                                     \
+    { /* sizes differ: in-place is rejected, then broadcast compatibility is checked */                      \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_PTR(in) != XAI_TILE3D_GET_DATA_PTR(out), XAI_ERR_INPLACE,          \
+                      "Inplace operation not supported for Broadcast Operation for ("#in ") and ("#out ")"); \
+      XAI_CHECK_TILE3D_SIZE_BCAST_EQ(in, out, inStride);                                                     \
+    }                                                                                                        \
+  }                                                                                                          \
+  else if (XAI_TILE3D_GET_DATA_ORDER(in) == XAI_DWH)                                                         \
+  { /* DWH layout: dim1 direct, dims 2 and 3 ceil-divided by inStride */                                     \
+    if ((XAI_TILE3D_GET_DIM1(in) == XAI_TILE3D_GET_DIM1(out)) &&                                             \
+        ((XAI_TILE3D_GET_DIM2(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM2(out)) &&                 \
+        ((XAI_TILE3D_GET_DIM3(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM3(out)))                   \
+    { /* same size: in-place use is allowed only with equal pitches */                                       \
+      if (XAI_TILE3D_GET_DATA_PTR(in) == XAI_TILE3D_GET_DATA_PTR(out))                                       \
+      {                                                                                                      \
+        XAI_CHECK_ERROR(XAI_TILE3D_PITCH_EQ(in, out), XAI_ERR_INPLACE, "Inplace operation not "              \
+                        "supported when pitch of ("#in ") and ("#out ") are not same");                      \
+      }                                                                                                      \
+    }                                                                                                        \
+    else                                                                                                     \
+    { /* sizes differ: in-place is rejected, then broadcast compatibility is checked */                      \
+      XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_PTR(in) != XAI_TILE3D_GET_DATA_PTR(out), XAI_ERR_INPLACE,          \
+                      "Inplace operation not supported for Broadcast Operation for ("#in ") and ("#out ")"); \
+      XAI_CHECK_TILE3D_SIZE_BCAST_EQ(in, out, inStride);                                                     \
+    }                                                                                                        \
+  }
+
+#define XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, in1Stride, in2Stride)                                \
+  XAI_CHECK_TILE3D_SIZE_EQ_OR_BCAST(inTile1, outTile, in1Stride);                                                         \
+  XAI_CHECK_TILE3D_SIZE_EQ_OR_BCAST(inTile2, outTile, in2Stride);                                                         \
+  if (XAI_TILE3D_GET_DATA_ORDER(outTile) == XAI_WHD) /* NOTE: macro expands to several unbraced statements */             \
+  { /* WHD: every outTile dim must equal the max of the two input dims (dims 1/2 ceil-divided by stride) */               \
+    XAI_CHECK_ERROR((MAX2(XAI_TILE3D_GET_DIM3(inTile1), XAI_TILE3D_GET_DIM3(inTile2)) == XAI_TILE3D_GET_DIM3(outTile)) && \
+                    (MAX2((XAI_TILE3D_GET_DIM1(inTile1) + in1Stride - 1) / in1Stride,                                     \
+                          (XAI_TILE3D_GET_DIM1(inTile2) + in2Stride - 1) / in2Stride) == XAI_TILE3D_GET_DIM1(outTile)) && \
+                    (MAX2((XAI_TILE3D_GET_DIM2(inTile1) + in1Stride - 1) / in1Stride,                                     \
+                          (XAI_TILE3D_GET_DIM2(inTile2) + in2Stride - 1) / in2Stride) == XAI_TILE3D_GET_DIM2(outTile)),   \
+                    XAI_ERR_DATASIZE, "Invalid dimension to perform BroadCast/ElementWise operations");                   \
+  }                                                                                                                       \
+  else if (XAI_TILE3D_GET_DATA_ORDER(outTile) == XAI_DWH)                                                                 \
+  { /* DWH: every outTile dim must equal the max of the two input dims (dims 2/3 ceil-divided by stride) */               \
+    XAI_CHECK_ERROR((MAX2(XAI_TILE3D_GET_DIM1(inTile1), XAI_TILE3D_GET_DIM1(inTile2)) == XAI_TILE3D_GET_DIM1(outTile)) && \
+                    (MAX2((XAI_TILE3D_GET_DIM2(inTile1) + in1Stride - 1) / in1Stride,                                     \
+                          (XAI_TILE3D_GET_DIM2(inTile2) + in2Stride - 1) / in2Stride) == XAI_TILE3D_GET_DIM2(outTile)) && \
+                    (MAX2((XAI_TILE3D_GET_DIM3(inTile1) + in1Stride - 1) / in1Stride,                                     \
+                          (XAI_TILE3D_GET_DIM3(inTile2) + in2Stride - 1) / in2Stride) == XAI_TILE3D_GET_DIM3(outTile)),   \
+                    XAI_ERR_DATASIZE, "Invalid dimension to perform BroadCast/ElementWise operations");                   \
+  }
+
+#define XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                      inTile2Pitch1, inTile1Pitch2, inTile2Pitch2)                   \
+  /* Declares six int32_t pitch variables at the expansion site (no enclosing braces). */            \
+  /* For each dim: if inTile1 has extent 1 its pitch is 0; otherwise, if inTile2 has */              \
+  /* extent 1 its pitch is 0. At most one of the two pitches per dim is zeroed. */                   \
+  int32_t inTile1Pitch0 =                                                                            \
+    (XAI_TILE3D_GET_DIM1(inTile1) == 1) ? 0 : 1;                                                     \
+  int32_t inTile2Pitch0 =                                                                            \
+    ((XAI_TILE3D_GET_DIM1(inTile1) != 1) && (XAI_TILE3D_GET_DIM1(inTile2) == 1))                     \
+    ? 0 : 1;                                                                                         \
+  int32_t inTile1Pitch1 =                                                                            \
+    (XAI_TILE3D_GET_DIM2(inTile1) == 1) ? 0 : XAI_TILE3D_GET_DIM1_PITCH(inTile1);                    \
+  int32_t inTile2Pitch1 =                                                                            \
+    ((XAI_TILE3D_GET_DIM2(inTile1) != 1) && (XAI_TILE3D_GET_DIM2(inTile2) == 1))                     \
+    ? 0 : XAI_TILE3D_GET_DIM1_PITCH(inTile2);                                                        \
+  int32_t inTile1Pitch2 =                                                                            \
+    (XAI_TILE3D_GET_DIM3(inTile1) == 1) ? 0 : XAI_TILE3D_GET_DIM2_PITCH(inTile1);                    \
+  int32_t inTile2Pitch2 =                                                                            \
+    ((XAI_TILE3D_GET_DIM3(inTile1) != 1) && (XAI_TILE3D_GET_DIM3(inTile2) == 1))                     \
+    ? 0 : XAI_TILE3D_GET_DIM2_PITCH(inTile2);
+
+#define XAI_TILE3D_SIZE_BCAST23_EQ(in, out, inStride) /* dim1 equal; dims 2/3 match after stride ceil-div, or are 1 */      \
+  ((((XAI_TILE3D_GET_DIM2(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM2(out)) || (XAI_TILE3D_GET_DIM2(in) == 1)) && \
+   (((XAI_TILE3D_GET_DIM3(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM3(out)) || (XAI_TILE3D_GET_DIM3(in) == 1)) && \
+   (XAI_TILE3D_GET_DIM1(in) == XAI_TILE3D_GET_DIM1(out)))
+
+#define XAI_CHECK_TILE3D_SIZE_EQ_OR_BCAST23(in, out, inStride)                                                  \
+  if (!((XAI_TILE3D_GET_DIM1(in) == XAI_TILE3D_GET_DIM1(out)) &&                                                \
+        ((XAI_TILE3D_GET_DIM2(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM2(out)) &&                    \
+        ((XAI_TILE3D_GET_DIM3(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM3(out))))                     \
+  { /* Size mismatch: in-place use is rejected, then broadcast compatibility is checked */                      \
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_PTR(in) != XAI_TILE3D_GET_DATA_PTR(out), XAI_ERR_INPLACE,               \
+                    "Inplace operation not supported for Broadcast Operation for ("#in ") and ("#out ")");      \
+    XAI_CHECK_ERROR(XAI_TILE3D_SIZE_BCAST23_EQ(in, out, inStride), XAI_ERR_DATASIZE,                            \
+                    "Invalid dimension in (" #in ") or (" #out ") to perform Elementwise broadcast operation"); \
+  }                                                                                                             \
+  else                                                                                                          \
+  { /* Same size: in-place use requires identical pitches */                                                    \
+    if (XAI_TILE3D_GET_DATA_PTR(in) == XAI_TILE3D_GET_DATA_PTR(out))                                            \
+    {                                                                                                           \
+      XAI_CHECK_ERROR(XAI_TILE3D_PITCH_EQ(in, out), XAI_ERR_INPLACE, "Inplace operation not "                   \
+                      "supported when pitch of ("#in ") and ("#out ") are not same");                           \
+    }                                                                                                           \
+  }
+
+#define XAI_CHECK_TILE3D_BCAST23_DIMENSIONS(inTile1, inTile2, outTile, in1Stride, in2Stride)                            \
+  XAI_CHECK_TILE3D_SIZE_EQ_OR_BCAST23(inTile1, outTile, in1Stride) /* inTile1 vs outTile size/broadcast check */        \
+  XAI_CHECK_TILE3D_SIZE_EQ_OR_BCAST23(inTile2, outTile, in2Stride) /* inTile2 vs outTile size/broadcast check */        \
+  XAI_CHECK_ERROR((MAX2(XAI_TILE3D_GET_DIM1(inTile1), XAI_TILE3D_GET_DIM1(inTile2)) == /* out dims = max of inputs */   \
+                   XAI_TILE3D_GET_DIM1(outTile)) &&                                                                     \
+                  (MAX2((XAI_TILE3D_GET_DIM2(inTile1) + in1Stride - 1) / in1Stride,                                     \
+                        (XAI_TILE3D_GET_DIM2(inTile2) + in2Stride - 1) / in2Stride) == XAI_TILE3D_GET_DIM2(outTile)) && \
+                  (MAX2((XAI_TILE3D_GET_DIM3(inTile1) + in1Stride - 1) / in1Stride,                                     \
+                        (XAI_TILE3D_GET_DIM3(inTile2) + in2Stride - 1) / in2Stride) == XAI_TILE3D_GET_DIM3(outTile)),   \
+                  XAI_ERR_DATASIZE, "Invalid dimension to perform BroadCast/ElementWise operations") /* no trailing ';' here: the call site supplies it */
+
+#define XAI_CHECK_LSTM_BLOCK(functionCall)                                             \
+  { /* Evaluates functionCall once and checks the result against XAI_ERR_OK. */        \
+    int32_t retVal = (functionCall);                                                   \
+    (void) retVal; /* presumably avoids an unused warning if checks compile out */     \
+    XAI_ERROR_CHECKS_CONTINUE()                                                        \
+    {                                                                                  \
+      XAI_CHECK_ERROR((retVal == XAI_ERR_OK), retVal,                                  \
+                      "\nError in file: %s, function: %s, LSTM block: %s, line: %d\n", \
+                      __FILE__, __func__, #functionCall, __LINE__);                    \
+    }                                                                                  \
+  }
+
diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_version.h b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_version.h
new file mode 100644
index 00000000000..18e97cc9d49
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_version.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#ifndef __XAI_CNN_VERSION_H__
+#define __XAI_CNN_VERSION_H__
+
+#if ((XCHAL_VISION_TYPE >= 6) || (XCHAL_HAVE_BBENEP == 1))
+#if (!defined(GLOW_BUILD) && !defined(MLIR_BUILD) && !defined(XNNC_PROJ_MGR_PROJECT))
+#include <xtensa/tie/xt_ivpn.h>
+#endif
+#endif
+
+#if (XCHAL_VISION_TYPE == 6 && XCHAL_VISION_SIMD16 == 8) //VP1, V110
+
+#define XAI_CNN_LIBRARY_DSP_PROCESSOR              P1
+#define XAI_CNN_LIBRARY_VERSION_MAJOR              2
+#define XAI_CNN_LIBRARY_VERSION_MINOR              0
+#define XAI_CNN_LIBRARY_VERSION_PATCH              0
+#define XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING  0
+
+#elif (XCHAL_VISION_TYPE == 6) // VP6, V130
+
+#define XAI_CNN_LIBRARY_DSP_PROCESSOR              P6
+#define XAI_CNN_LIBRARY_VERSION_MAJOR              2
+#define XAI_CNN_LIBRARY_VERSION_MINOR              0
+#define XAI_CNN_LIBRARY_VERSION_PATCH              0
+#define XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING  0
+
+#elif ((XCHAL_VISION_TYPE == 7) || ((XCHAL_VISION_TYPE == 9) && (XCHAL_IVPN_SIMD_WIDTH == 32)))   //VQ7, V240, V331, NeuroEdge
+#define XAI_CNN_LIBRARY_DSP_PROCESSOR              Q7
+#define XAI_CNN_LIBRARY_VERSION_MAJOR              2
+#define XAI_CNN_LIBRARY_VERSION_MINOR              0
+#define XAI_CNN_LIBRARY_VERSION_PATCH              0
+#define XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING  0
+
+#elif ((XCHAL_VISION_TYPE >= 8) || ((XCHAL_HAVE_BBENEP == 1) && (XCHAL_BBEN_SIMD_WIDTH == 64))) // VQ8, V240, V341, MathX_240
+
+#define XAI_CNN_LIBRARY_DSP_PROCESSOR              Q8
+#define XAI_CNN_LIBRARY_VERSION_MAJOR              2
+#define XAI_CNN_LIBRARY_VERSION_MINOR              0
+#define XAI_CNN_LIBRARY_VERSION_PATCH              0
+#define XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING  0
+
+#elif (XCHAL_HAVE_HIFI1 || XCHAL_HAVE_HIFI3Z || XCHAL_HAVE_HIFI4 || XCHAL_HAVE_HIFI5) //HiFi
+
+#define XAI_CNN_LIBRARY_DSP_PROCESSOR              HIFI
+#define XAI_CNN_LIBRARY_VERSION_MAJOR              2
+#define XAI_CNN_LIBRARY_VERSION_MINOR              0
+#define XAI_CNN_LIBRARY_VERSION_PATCH              0
+#define XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING  0
+
+#else
+
+#define XAI_CNN_LIBRARY_DSP_PROCESSOR              REFF
+#define XAI_CNN_LIBRARY_VERSION_MAJOR              2
+#define XAI_CNN_LIBRARY_VERSION_MINOR              0
+#define XAI_CNN_LIBRARY_VERSION_PATCH              0
+#define XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING  0
+#endif //if Processor type
+
+#define XAI_AUX_STR_EXP(__A)  #__A
+#define XAI_AUX_STR(__A)      XAI_AUX_STR_EXP(__A)
+#define XAI_CNN_LIBRARY_VERSION_STR  XAI_AUX_STR(XAI_CNN_LIBRARY_DSP_PROCESSOR) "." XAI_AUX_STR(XAI_CNN_LIBRARY_VERSION_MAJOR) "." XAI_AUX_STR(XAI_CNN_LIBRARY_VERSION_MINOR) "." XAI_AUX_STR(XAI_CNN_LIBRARY_VERSION_PATCH) "." XAI_AUX_STR(XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING)
+#endif /* __XAI_CNN_VERSION_H__ */
diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_config_api.h b/backends/cadence/vision/third-party/libxai_common/include/xai_config_api.h
new file mode 100644
index 00000000000..2e2b6811fea
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/include/xai_config_api.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2021 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#ifndef __XAI_CONFIG_API_H__
+#define __XAI_CONFIG_API_H__
+
+#ifndef XAI_REF_ONLY_COMPILATION
+#include <xtensa/config/core-isa.h>
+#endif
+
+// Contains IVP to BBE mappings
+#if (XCHAL_HAVE_BBENEP == 1)
+#include <xtensa/tie/xt_ivpn.h>
+#endif
+
+#include "xai_cnn_version.h"
+
+#ifndef __XTENSA__
+    #if defined(_MSC_VER)
+        #pragma warning (disable : 4005 )
+    #endif
+    #ifdef __cplusplus
+        #if defined(_MSC_VER) && (_MSC_VER >= 1900)
+            #define restrict  __restrict
+        #else
+            #define restrict
+        #endif
+    #endif
+    #ifndef XCHAL_NUM_DATARAM
+        #define XCHAL_NUM_DATARAM  2
+    #endif
+#endif
+
+#if !defined(__XTENSA__) || !(defined(XCHAL_HAVE_VISION) || defined(XCHAL_HAVE_BBENEP)) || !(XCHAL_HAVE_VISION || XCHAL_HAVE_BBENEP)
+#   define XV_EMULATE_DMA
+#endif
+
+// #define XAI_EMULATE_LOCAL_RAM 0
+#ifndef XAI_EMULATE_LOCAL_RAM
+#  define XAI_EMULATE_LOCAL_RAM  1
+#endif
+
+/* XAI Library API qualifiers */
+
+#if XAI_EMULATE_LOCAL_RAM && __XTENSA__
+#if XCHAL_NUM_DATARAM == 2
+#  define _XAI_LOCAL_RAM0_  __attribute__((section(".dram0.data")))
+#  define _XAI_LOCAL_RAM1_  __attribute__((section(".dram1.data")))
+#elif XCHAL_NUM_DATARAM == 1
+#  define _XAI_LOCAL_RAM0_  __attribute__((section(".dram0.data")))
+#endif
+#  define _XAI_LOCAL_IRAM_  __attribute__((section(".iram0.text")))
+#else
+#  define _XAI_LOCAL_RAM0_
+#  define _XAI_LOCAL_RAM1_
+#  define _XAI_LOCAL_IRAM_
+#endif
+
+#if !defined(_XAI_EXPORTS_)
+#  if defined __GNUC__ && __GNUC__ >= 4
+#    define _XAI_EXPORTS_  __attribute__((visibility("default")))
+#  elif defined(_MSC_VER)
+#    if defined(XAI_CREATE_SHARED_LIBRARY)
+#      define _XAI_EXPORTS_  __declspec(dllexport)
+#    else
+#      define _XAI_EXPORTS_  __declspec(dllimport)
+#    endif
+#  else
+#    define _XAI_EXPORTS_
+#  endif
+#endif
+
+#ifdef __cplusplus
+#  define _XAI_EXTERN_C_  extern "C"
+#else
+#  define _XAI_EXTERN_C_  extern
+#endif
+
+#ifdef __cplusplus
+#  define XAI_DEFAULT(value) = (value)
+#else
+#  define XAI_DEFAULT(value)
+#endif
+
+#if defined(__XTENSA__) && (!defined(DISABLE_AGGRESSIVE_INLINE))
+#define _XAI_INLINE_  __attribute((always_inline))
+#else
+#define _XAI_INLINE_
+#endif
+
+#ifdef GLOW_SPECIAL_BUILD
+#   define _XAI_API_      _XAI_EXTERN_C_
+#   define _XAI_API_VAR_  _XAI_API_
+#else
+#   define _XAI_API_      _XAI_EXTERN_C_ _XAI_EXPORTS_ _XAI_INLINE_
+#   define _XAI_API_VAR_  _XAI_EXTERN_C_ _XAI_EXPORTS_
+#endif
+
+/* error check levels */
+
+/* do not check arguments for errors */
+#define XAI_ERROR_LEVEL_NO_ERROR                     0
+/* call exit(-1) in case of error */
+#define XAI_ERROR_LEVEL_TERMINATE_ON_ERROR           1
+/* return corresponding error code on error without any processing (recommended) */
+#define XAI_ERROR_LEVEL_RETURN_ON_ERROR              2
+/* capture error but attempt to continue processing (dangerous!) */
+#define XAI_ERROR_LEVEL_CONTINUE_ON_ERROR            3
+/* print error message to stdout and return without any processing */
+#define XAI_ERROR_LEVEL_PRINT_ON_ERROR               4
+/* print error message but attempt to continue processing (dangerous!) */
+#define XAI_ERROR_LEVEL_PRINT_AND_CONTINUE_ON_ERROR  5
+
+#ifndef XAI_ERROR_LEVEL
+#  define XAI_ERROR_LEVEL  XAI_ERROR_LEVEL_RETURN_ON_ERROR
+#endif
+#endif
diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_core.h b/backends/cadence/vision/third-party/libxai_common/include/xai_core.h
new file mode 100644
index 00000000000..010a3a48ad9
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/include/xai_core.h
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2013-2018 Tensilica Inc. ALL RIGHTS RESERVED.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __XAI_CORE_H__
+#define __XAI_CORE_H__
+
+/* Force-disable DRAM boundary checks so XAI kernels accept system memory pointers.
+   Required for cache-variant convolution which operates on system memory directly. */
+#ifndef SYS_MEM_TESTING
+#define SYS_MEM_TESTING 1
+#endif
+#ifndef XAI_ERROR_CHECKS_RELAXED_REF
+#define XAI_ERROR_CHECKS_RELAXED_REF 1
+#endif
+
+#include "xai_core_api.h"
+
+#if defined(_MSC_VER)
+#define isfinite  _finite
+#define __func__  __FUNCTION__
+#endif
+
+/* Linear congruential generator: next = (RND_A * x + RND_C) masked to the low RND_LOG_M bits */
+#define RND_A      1103515245
+#define RND_LOG_M  31
+#define RND_C      12345
+#define GET_NEXT_RND(x_pr)  (((RND_A) *(x_pr) + (RND_C)) & ((1u << (RND_LOG_M)) - 1u)) /* 1u: shifting a signed 1 by 31 is undefined */
+
+/* return 0 on success or required memory size on failure */
+_XAI_EXTERN_C_ size_t xaiFitArray_U8(const xai_pArray donor, xai_pArray rec, int width, int height, xai_bool aligned);
+_XAI_EXTERN_C_ size_t xaiFitArray_U8S16(const xai_pArray donor, xai_pArray rec, int width, int height, xai_bool aligned);
+_XAI_EXTERN_C_ size_t xaiFitArray_S16(const xai_pArray donor, xai_pArray rec, int width, int height, xai_bool aligned);
+_XAI_EXTERN_C_ size_t xaiFitTile_U8(const xai_pTile2D donor, xai_pTile2D rec, int width, int height, xai_bool aligned);
+_XAI_EXTERN_C_ size_t xaiFitTile_S16(const xai_pTile2D donor, xai_pTile2D rec, int width, int height, xai_bool aligned);
+
+#define XAI_FIT_ALIGNED  1
+#define XAI_FIT_ANY      0
+
+
+// error check macro
+#if XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_PRINT_ON_ERROR || XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_PRINT_AND_CONTINUE_ON_ERROR
+#  include <stdio.h>
+#endif
+
+#if XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_TERMINATE_ON_ERROR
+#  include <stdlib.h>
+#endif
+
+#define MARK_VAR_AS_USED(var)  (void) (var)
+
+#if (XAI_ERROR_LEVEL != XAI_ERROR_LEVEL_NO_ERROR) /* checking enabled: declare the local status variable */
+#  define XAI_ERROR_CHECKS()           XAI_ERR_TYPE __xai_local_err_code = XAI_ERR_OK;
+#  define XAI_ERROR_CHECKS_CONTINUE()
+#  define XAI_ERROR_STATUS()           __xai_local_err_code
+#else /* checking disabled: the statement following either macro becomes the body of a never-run while (0) */
+#  define XAI_ERROR_CHECKS()           while (0)
+#  define XAI_ERROR_CHECKS_CONTINUE()  while (0)
+#  define XAI_ERROR_STATUS()           XAI_ERR_OK
+#endif
+
+#if XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_TERMINATE_ON_ERROR /* failed check: exit(-1) */
+#  define XAI_CHECK_ERROR(condition, code, ...) \
+  if (condition) {} else exit(-1)
+#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_RETURN_ON_ERROR /* failed check: return the code */
+#  define XAI_CHECK_ERROR(condition, code, ...) \
+  if (condition) {} else return (code)
+#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_CONTINUE_ON_ERROR /* failed check: record the code, keep going */
+#  define XAI_CHECK_ERROR(condition, code, ...) \
+  if (condition) {} else __xai_local_err_code = (code)
+#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_PRINT_ON_ERROR /* failed check: print, then return the code */
+#  define XAI_CHECK_ERROR(condition, code, ...)                                                                                           \
+  do { if (!(condition)) { printf("%s:%d: Error #%d (%s) in function %s: ", __FILE__, __LINE__, (int) (code), xaiErrStr(code), __func__); \
+                           printf(__VA_ARGS__);                                                                                           \
+                           printf("\n");                                                                                                  \
+                           fflush(stdout); return (code); } } while (0)
+#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_PRINT_AND_CONTINUE_ON_ERROR /* failed check: print, record the code, keep going (no return) */
+#  define XAI_CHECK_ERROR(condition, code, ...)                                                                                           \
+  do { if (!(condition)) { printf("%s:%d: Error #%d (%s) in function %s: ", __FILE__, __LINE__, (int) (code), xaiErrStr(code), __func__); \
+                           printf(__VA_ARGS__);                                                                                           \
+                           printf("\n");                                                                                                  \
+                           fflush(stdout); __xai_local_err_code = (code); } } while (0)
+#else /* any other level, including XAI_ERROR_LEVEL_NO_ERROR: the check expands to nothing */
+#  define XAI_CHECK_ERROR(condition, code, ...)
+#endif
+
+// helper macro
+#define XAI_ARRAY_USEFUL_CAPACITY(array, ptr)  ((ptrdiff_t) XAI_ARRAY_GET_BUFF_SIZE(array) - ((uint8_t *) (ptr) - (uint8_t *) XAI_ARRAY_GET_BUFF_PTR(array)))
+
+// macro for standard array/tile checks:
+
+// check that array/tile data is placed in the DRAM
+#if XAI_EMULATE_LOCAL_RAM && __XTENSA__
+#if XCHAL_NUM_DATARAM == 2
+#define XAI_ARRAY_STARTS_IN_DRAM(t)                                                   \
+  (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM0_VADDR) || \
+   (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM1_VADDR)))
+#define XAI_ARRAY_ENDS_IN_DRAM(t)                                                                                                                       \
+  (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) + XAI_ARRAY_GET_BUFF_SIZE(t) <= (((uint32_t) XCHAL_DATARAM0_VADDR) + ((uint32_t) XCHAL_DATARAM0_SIZE)) || \
+   (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) + XAI_ARRAY_GET_BUFF_SIZE(t) <= (((uint32_t) XCHAL_DATARAM1_VADDR) + ((uint32_t) XCHAL_DATARAM1_SIZE))))
+#define XAI_TILE2D_STARTS_IN_DRAM(t)                                                   \
+  (XAI_PTR_TO_ADDR(XAI_TILE2D_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM0_VADDR) || \
+   (XAI_PTR_TO_ADDR(XAI_TILE2D_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM1_VADDR)))
+#define XAI_TILE2D_ENDS_IN_DRAM(t)                                                                                                                      \
+  (XAI_PTR_TO_ADDR(XAI_TILE2D_GET_BUFF_PTR(t)) + XAI_TILE2D_GET_BUFF_SIZE(t) <= (((uint32_t) XCHAL_DATARAM0_VADDR) + ((uint32_t) XCHAL_DATARAM0_SIZE)) || \
+   (XAI_PTR_TO_ADDR(XAI_TILE2D_GET_BUFF_PTR(t)) + XAI_TILE2D_GET_BUFF_SIZE(t) <= (((uint32_t) XCHAL_DATARAM1_VADDR) + ((uint32_t) XCHAL_DATARAM1_SIZE))))
+#elif XCHAL_NUM_DATARAM == 1
+#define XAI_ARRAY_STARTS_IN_DRAM(t) \
+  (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM0_VADDR))
+#define XAI_ARRAY_ENDS_IN_DRAM(t) \
+  (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) + XAI_ARRAY_GET_BUFF_SIZE(t) <= (((uint32_t) XCHAL_DATARAM0_VADDR) + ((uint32_t) XCHAL_DATARAM0_SIZE)))
+#define XAI_TILE2D_STARTS_IN_DRAM(t) \
+  (XAI_PTR_TO_ADDR(XAI_TILE2D_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM0_VADDR))
+#define XAI_TILE2D_ENDS_IN_DRAM(t) \
+  (XAI_PTR_TO_ADDR(XAI_TILE2D_GET_BUFF_PTR(t)) + XAI_TILE2D_GET_BUFF_SIZE(t) <= (((uint32_t) XCHAL_DATARAM0_VADDR) + ((uint32_t) XCHAL_DATARAM0_SIZE)))
+#endif
+
+#else //#XAI_EMULATE_LOCAL_RAM && __XTENSA__
+#define XAI_ARRAY_STARTS_IN_DRAM(t)  1
+#define XAI_ARRAY_ENDS_IN_DRAM(t)    1
+#define XAI_TILE2D_STARTS_IN_DRAM(t)   1
+#define XAI_TILE2D_ENDS_IN_DRAM(t)     1
+#endif //#XAI_EMULATE_LOCAL_RAM && __XTENSA__
+
+// check the minimal alignment requirements
+#define XAI_ARRAY_IS_WIDTH_ALIGNED(t)       ((XAI_ARRAY_GET_WIDTH(t) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_ARRAY_IS_WIDTH_ALIGNED2(t)      ((XAI_ARRAY_GET_WIDTH(t) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_ARRAY_IS_WIDTH_ALIGNED_2(t)     ((XAI_ARRAY_GET_WIDTH(t) & (XCHAL_IVPN_SIMD_WIDTH / 2 - 1)) == 0)
+#define XAI_ARRAY_IS_STRIDE_ALIGNED(t)      ((XAI_ARRAY_GET_PITCH(t) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_ARRAY_IS_STRIDE_ALIGNED2(t)     ((XAI_ARRAY_GET_PITCH(t) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_ARRAY_IS_STRIDE_ALIGNED_2(t)    ((XAI_ARRAY_GET_PITCH(t) & (XCHAL_IVPN_SIMD_WIDTH / 2 - 1)) == 0)
+#define XAI_ARRAY_IS_PTR_ALIGNED_NX8(t)     ((XAI_PTR_TO_ADDR(XAI_ARRAY_GET_DATA_PTR(t)) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_ARRAY_IS_PTR_ALIGNED_2NX8(t)    ((XAI_PTR_TO_ADDR(XAI_ARRAY_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_ARRAY_IS_PTR_ALIGNED_NX16(t)    ((XAI_PTR_TO_ADDR(XAI_ARRAY_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_ARRAY_IS_PTR_ALIGNED_N_2X32(t)  ((XAI_PTR_TO_ADDR(XAI_ARRAY_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+
+#define XAI_ARRAY_IS_ALIGNED_NX8(t)         (XAI_ARRAY_IS_PTR_ALIGNED_NX8(t) && XAI_ARRAY_IS_WIDTH_ALIGNED(t) && XAI_ARRAY_IS_STRIDE_ALIGNED(t))
+#define XAI_ARRAY_IS_ALIGNED_2NX8(t)        (XAI_ARRAY_IS_PTR_ALIGNED_2NX8(t) && XAI_ARRAY_IS_WIDTH_ALIGNED2(t) && XAI_ARRAY_IS_STRIDE_ALIGNED2(t))
+#define XAI_ARRAY_IS_ALIGNED_NX16(t)        (XAI_ARRAY_IS_PTR_ALIGNED_NX16(t) && XAI_ARRAY_IS_WIDTH_ALIGNED(t) && XAI_ARRAY_IS_STRIDE_ALIGNED(t))
+#define XAI_ARRAY_IS_ALIGNED_N_2X32(t)      (XAI_ARRAY_IS_PTR_ALIGNED_N_2X32(t) && XAI_ARRAY_IS_WIDTH_ALIGNED_2(t) && XAI_ARRAY_IS_STRIDE_ALIGNED_2(t))
+
+#define XAI_TILE2D_IS_WIDTH_ALIGNED(t)        XAI_ARRAY_IS_WIDTH_ALIGNED(t)
+#define XAI_TILE2D_IS_WIDTH_ALIGNED2(t)       XAI_ARRAY_IS_WIDTH_ALIGNED2(t)
+#define XAI_TILE2D_IS_WIDTH_ALIGNED_2(t)      XAI_ARRAY_IS_WIDTH_ALIGNED_2(t)
+#define XAI_TILE2D_IS_STRIDE_ALIGNED(t)       XAI_ARRAY_IS_STRIDE_ALIGNED(t)
+#define XAI_TILE2D_IS_STRIDE_ALIGNED2(t)      XAI_ARRAY_IS_STRIDE_ALIGNED2(t)
+#define XAI_TILE2D_IS_STRIDE_ALIGNED_2(t)     XAI_ARRAY_IS_STRIDE_ALIGNED_2(t)
+#define XAI_TILE2D_IS_PTR_ALIGNED_NX8(t)      XAI_ARRAY_IS_PTR_ALIGNED_NX8(t)
+#define XAI_TILE2D_IS_PTR_ALIGNED_2NX8(t)     XAI_ARRAY_IS_PTR_ALIGNED_2NX8(t)
+#define XAI_TILE2D_IS_PTR_ALIGNED_NX16(t)     XAI_ARRAY_IS_PTR_ALIGNED_NX16(t)
+#define XAI_TILE2D_IS_PTR_ALIGNED_N_2X32(t)   XAI_ARRAY_IS_PTR_ALIGNED_N_2X32(t)
+
+// check array invariants
+#define XAI_ARRAY_IS_1D(t)                     (XAI_ARRAY_GET_HEIGHT(t) == 1)
+
+#define XAI_ARRAY_CHECK_TYPE(a, type)          (XAI_TYPE_ELEMENT_TYPE(XAI_ARRAY_GET_TYPE(a)) == (type)) // true when the array's element type equals type
+
+#define XAI_ARRAY_CHECK_ELEMENT_SIZE(a, size)  (XAI_ARRAY_GET_ELEMENT_SIZE(a) == (size))
+
+#define XAI_ARRAY_SIZE_EQ(t1, t2)              (XAI_ARRAY_GET_WIDTH(t1) == XAI_ARRAY_GET_WIDTH(t2) && XAI_ARRAY_GET_HEIGHT(t1) == XAI_ARRAY_GET_HEIGHT(t2))
+
+#define XAI_ARRAY_SIZE_GEQ(t1, t2)             (XAI_ARRAY_GET_WIDTH(t1) >= XAI_ARRAY_GET_WIDTH(t2) && XAI_ARRAY_GET_HEIGHT(t1) >= XAI_ARRAY_GET_HEIGHT(t2))
+
+#define XAI_ARRAYS_ARE_NOT_OVERLAP(t1, t2)     (XAI_ARRAY_GET_DATA_PTR(t1) != XAI_ARRAY_GET_DATA_PTR(t2))
+
+#define XAI_ARRAY_IS_CONSISTENT(a)                                                                                                                            \
+  ((XAI_ARRAY_GET_PITCH(a) >= XAI_ARRAY_GET_WIDTH(a)) &&                                                                                                      \
+   (XAI_ARRAY_GET_WIDTH(a) > 0) && (XAI_ARRAY_GET_HEIGHT(a) > 0) &&                                                                                           \
+   ((uint8_t *) XAI_ARRAY_GET_DATA_PTR(a) >= (uint8_t *) XAI_ARRAY_GET_BUFF_PTR(a)) &&                                                                        \
+   ((uint8_t *) XAI_ARRAY_GET_DATA_PTR(a) + (XAI_ARRAY_GET_PITCH(a) * (XAI_ARRAY_GET_HEIGHT(a) - 1) + XAI_ARRAY_GET_WIDTH(a)) * XAI_ARRAY_GET_ELEMENT_SIZE(a) \
+    <= (uint8_t *) XAI_ARRAY_GET_BUFF_PTR(a) + XAI_ARRAY_GET_BUFF_SIZE(a)))
+
+// common array error checks
+#define XAI_CHECK_POINTER(pointer) /* fails with XAI_ERR_NULLARG when the argument compares equal to 0 */ \
+  XAI_CHECK_ERROR((pointer) != 0, XAI_ERR_NULLARG, "The pointer (" #pointer ") is NULL")
+
+#if ((defined(XCHAL_VISION_TYPE) && (XCHAL_VISION_TYPE >= 6)) || (defined(XCHAL_HAVE_BBENEP) && (XCHAL_HAVE_BBENEP == 1)))
+
+#define XAI_CHECK_BUFFER(array)                                                                                                \
+  XAI_CHECK_POINTER(array);                                                                                                    \
+  XAI_CHECK_ERROR(XAI_ARRAY_STARTS_IN_DRAM(array), XAI_ERR_MEMLOCAL, "The argument (" #array ") data does not start in DRAM"); \
+  XAI_CHECK_ERROR(XAI_ARRAY_ENDS_IN_DRAM(array), XAI_ERR_MEMLOCAL, "Complete data for the argument  (" #array ")  does not lie in DRAM")
+
+#else
+
+#define XAI_CHECK_BUFFER(array) \
+  XAI_CHECK_POINTER(array);
+#endif
+
+#define XAI_CHECK_ARRAY(array) \
+  XAI_CHECK_BUFFER(array);     \
+  XAI_CHECK_ERROR(XAI_ARRAY_IS_CONSISTENT(array), XAI_ERR_BADARG, "The argument (" #array ") is invalid")
+
+#define XAI_CHECK_ARRAY_I(array, element_size)                           \
+  XAI_CHECK_ARRAY(array);                                                \
+  XAI_CHECK_ERROR(XAI_ARRAY_CHECK_ELEMENT_SIZE(array, element_size) &&   \
+                  !((XAI_ARRAY_GET_TYPE(array)) & (XAI_TYPE_FLOAT_BIT)), \
+                  XAI_ERR_DATATYPE, "The argument (" #array ") has wrong type")
+
+#define XAI_CHECK_ARRAY_X(array, element_size)                       \
+  XAI_CHECK_ARRAY(array);                                            \
+  XAI_CHECK_ERROR(XAI_ARRAY_CHECK_ELEMENT_SIZE(array, element_size), \
+                  XAI_ERR_DATATYPE, "The argument (" #array ") has wrong type")
+
+#define XAI_CHECK_ARRAY_T(array, type) \
+  XAI_CHECK_ARRAY(array);              \
+  XAI_CHECK_ERROR(XAI_ARRAY_CHECK_TYPE(array, type), XAI_ERR_DATATYPE, "The argument (" #array ") has wrong type")
+
+#define XAI_CHECK_ARRAY_I8(array)   XAI_CHECK_ARRAY_I(array, sizeof(int8_t))
+#define XAI_CHECK_ARRAY_I16(array)  XAI_CHECK_ARRAY_I(array, sizeof(int16_t))
+#define XAI_CHECK_ARRAY_I32(array)  XAI_CHECK_ARRAY_I(array, sizeof(int32_t))
+
+#define XAI_CHECK_ARRAY_X16(array)  XAI_CHECK_ARRAY_X(array, sizeof(int16_t))
+#define XAI_CHECK_ARRAY_X32(array)  XAI_CHECK_ARRAY_X(array, sizeof(int32_t))
+
+#define XAI_CHECK_ARRAY_U8(array)   XAI_CHECK_ARRAY_T(array, XAI_U8)
+#define XAI_CHECK_ARRAY_S8(array)   XAI_CHECK_ARRAY_T(array, XAI_S8)
+#define XAI_CHECK_ARRAY_U16(array)  XAI_CHECK_ARRAY_T(array, XAI_U16)
+#define XAI_CHECK_ARRAY_S16(array)  XAI_CHECK_ARRAY_T(array, XAI_S16)
+#define XAI_CHECK_ARRAY_U32(array)  XAI_CHECK_ARRAY_T(array, XAI_U32)
+#define XAI_CHECK_ARRAY_S32(array)  XAI_CHECK_ARRAY_T(array, XAI_S32)
+#define XAI_CHECK_ARRAY_S64(array)  XAI_CHECK_ARRAY_T(array, XAI_S64)
+#define XAI_CHECK_ARRAY_F16(array)  XAI_CHECK_ARRAY_T(array, XAI_F16)
+#define XAI_CHECK_ARRAY_F32(array)  XAI_CHECK_ARRAY_T(array, XAI_F32)
+
+#define XAI_CHECK_ARRAY_IS_1D(array) \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(array) == 1, XAI_ERR_BADARG, "The argument (" #array ") must be a 1D array")
+
+#define XAI_CHECK_ARRAYS_ARE_NOT_OVERLAP(array0, array1) \
+  XAI_CHECK_ERROR(XAI_ARRAYS_ARE_NOT_OVERLAP(array0, array1), XAI_ERR_INPLACE, "Inplace operation is not supported")
+
+#define XAI_CHECK_ARRAY_ELEMENT_SIZE_EQ(array0, array1)                                     \
+  XAI_CHECK_ERROR(XAI_ARRAY_GET_ELEMENT_SIZE(array0) == XAI_ARRAY_GET_ELEMENT_SIZE(array1), \
+                  XAI_ERR_DATATYPE, "The (" #array0 ") element size must be equal to the (" #array1 ") element size")
+
+#define XAI_CHECK_ARRAY_SIZE_EQ(array0, array1) \
+  XAI_CHECK_ERROR(XAI_ARRAY_SIZE_EQ(array0, array1), XAI_ERR_DATASIZE, "The (" #array0 ") argument size is not equal to the (" #array1 ") argument size")
+
+#define XAI_CHECK_ARRAY_SIZE_GEQ(array0, array1) \
+  XAI_CHECK_ERROR(XAI_ARRAY_SIZE_GEQ(array0, array1), XAI_ERR_DATASIZE, "The (" #array0 ") argument size is not equal to OR greater than the (" #array1 ") argument size")
+
+#define XAI_CHECK_ARRAY_ALIGNMENT(array, DEPTH, ERR) \
+  XAI_CHECK_ERROR(XAI_ARRAY_IS_ALIGNED_ ## DEPTH(array), XAI_ERR_ ## ERR, "The argument (" #array ") is not fully aligned")
+
+#define XAI_CHECK_ARRAY_IALIGNMENT_NX8(array)     XAI_CHECK_ARRAY_ALIGNMENT(array, NX8, IALIGNMENT)
+#define XAI_CHECK_ARRAY_IALIGNMENT_2NX8(array)    XAI_CHECK_ARRAY_ALIGNMENT(array, 2NX8, IALIGNMENT)
+#define XAI_CHECK_ARRAY_IALIGNMENT_NX16(array)    XAI_CHECK_ARRAY_ALIGNMENT(array, NX16, IALIGNMENT)
+#define XAI_CHECK_ARRAY_IALIGNMENT_N_2X32(array)  XAI_CHECK_ARRAY_ALIGNMENT(array, N_2X32, IALIGNMENT)
+#define XAI_CHECK_ARRAY_OALIGNMENT_NX8(array)     XAI_CHECK_ARRAY_ALIGNMENT(array, NX8, OALIGNMENT)
+#define XAI_CHECK_ARRAY_OALIGNMENT_2NX8(array)    XAI_CHECK_ARRAY_ALIGNMENT(array, 2NX8, OALIGNMENT)
+#define XAI_CHECK_ARRAY_OALIGNMENT_NX16(array)    XAI_CHECK_ARRAY_ALIGNMENT(array, NX16, OALIGNMENT)
+#define XAI_CHECK_ARRAY_OALIGNMENT_N_2X32(array)  XAI_CHECK_ARRAY_ALIGNMENT(array, N_2X32, OALIGNMENT)
+
+
+// check tile invariants
+#define XAI_TILE2D_IS_CONSISTENT(t)                                                                                                                                                                                   \
+  ((XAI_TILE2D_GET_PITCH(t) >= XAI_TILE2D_GET_WIDTH(t) + XAI_TILE2D_GET_EDGE_WIDTH(t) * 2) &&                                                                                                                             \
+   ((uint8_t *) XAI_TILE2D_GET_DATA_PTR(t) - (XAI_TILE2D_GET_EDGE_WIDTH(t) + XAI_TILE2D_GET_PITCH(t) * XAI_TILE2D_GET_EDGE_HEIGHT(t)) * XAI_TILE2D_GET_ELEMENT_SIZE(t)                                                        \
+    >= (uint8_t *) XAI_TILE2D_GET_BUFF_PTR(t)) &&                                                                                                                                                                     \
+   ((uint8_t *) XAI_TILE2D_GET_DATA_PTR(t) + (XAI_TILE2D_GET_PITCH(t) * (XAI_TILE2D_GET_HEIGHT(t) + XAI_TILE2D_GET_EDGE_HEIGHT(t) - 1) + XAI_TILE2D_GET_WIDTH(t) + XAI_TILE2D_GET_EDGE_WIDTH(t)) * XAI_TILE2D_GET_ELEMENT_SIZE(t) \
+    <= (uint8_t *) XAI_TILE2D_GET_BUFF_PTR(t) + XAI_TILE2D_GET_BUFF_SIZE(t)))
+
+// common tile error checks
+#define XAI_CHECK_TILE2D(tile)                                                                                                \
+  XAI_CHECK_POINTER(tile);                                                                                                  \
+  XAI_CHECK_ERROR(XAI_TILE2D_IS_CONSISTENT(tile), XAI_ERR_BADARG, "The argument (" #tile ") is invalid");                     \
+  XAI_CHECK_ERROR(XAI_TILE2D_IS_TILE2D(tile), XAI_ERR_BADARG, "The argument (" #tile ") is not a tile");                        \
+  XAI_CHECK_ERROR(XAI_TILE2D_STARTS_IN_DRAM(tile), XAI_ERR_MEMLOCAL, "The argument (" #tile ") data does not start in DRAM"); \
+  XAI_CHECK_ERROR(XAI_TILE2D_ENDS_IN_DRAM(tile), XAI_ERR_MEMLOCAL, "Complete data for the argument  (" #tile ")  does not lie in DRAM")
+
+#define XAI_TILE2D_CHECK_TYPE(a, type) /* element type must equal type AND the object must be a 2D tile */ \
+  ((XAI_TYPE_ELEMENT_TYPE(XAI_TILE2D_GET_TYPE(a)) == (type)) && (XAI_TILE2D_IS_TILE2D(a)))
+
+#define XAI_CHECK_TILE2D_I(tile, element_size)                        \
+  XAI_CHECK_TILE2D(tile);                                             \
+  XAI_CHECK_ERROR(XAI_ARRAY_CHECK_ELEMENT_SIZE(tile, element_size), \
+                  XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type")
+
+#define XAI_CHECK_TILE2D_T(tile, type) \
+  XAI_CHECK_TILE2D(tile);              \
+  XAI_CHECK_ERROR(XAI_TILE2D_CHECK_TYPE(tile, type), XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type")
+
+#define XAI_CHECK_TILE2D_I8(array)   XAI_CHECK_TILE2D_I(array, sizeof(int8_t))
+#define XAI_CHECK_TILE2D_I16(array)  XAI_CHECK_TILE2D_I(array, sizeof(int16_t))
+#define XAI_CHECK_TILE2D_I32(array)  XAI_CHECK_TILE2D_I(array, sizeof(int32_t))
+
+#define XAI_CHECK_TILE2D_U8(array)   XAI_CHECK_TILE2D_T(array, XAI_U8)
+#define XAI_CHECK_TILE2D_S8(array)   XAI_CHECK_TILE2D_T(array, XAI_S8)
+#define XAI_CHECK_TILE2D_U16(array)  XAI_CHECK_TILE2D_T(array, XAI_U16)
+#define XAI_CHECK_TILE2D_S16(array)  XAI_CHECK_TILE2D_T(array, XAI_S16)
+#define XAI_CHECK_TILE2D_U32(array)  XAI_CHECK_TILE2D_T(array, XAI_U32)
+#define XAI_CHECK_TILE2D_S32(array)  XAI_CHECK_TILE2D_T(array, XAI_S32)
+
+#define XAI_CHECK_TILE2D_EDGE(tile, edge) /* both edge width and edge height must be >= edge */            \
+  XAI_CHECK_ERROR(XAI_TILE2D_GET_EDGE_WIDTH(tile) >= (edge) && XAI_TILE2D_GET_EDGE_HEIGHT(tile) >= (edge), \
+                  XAI_ERR_EDGE, "The (" #tile ") tile must have at least " #edge "-pixel edge extension")
+
+#define XAI_CHECK_TILES_ARE_NOT_OVERLAP(tile0, tile1)  XAI_CHECK_ARRAYS_ARE_NOT_OVERLAP(tile0, tile1)
+
+#define XAI_CHECK_TILE2D_IALIGNMENT_NX8(tile)            XAI_CHECK_ARRAY_IALIGNMENT_NX8(tile)
+#define XAI_CHECK_TILE2D_IALIGNMENT_2NX8(tile)           XAI_CHECK_ARRAY_IALIGNMENT_2NX8(tile)
+#define XAI_CHECK_TILE2D_IALIGNMENT_NX16(tile)           XAI_CHECK_ARRAY_IALIGNMENT_NX16(tile)
+#define XAI_CHECK_TILE2D_IALIGNMENT_N_2X32(tile)         XAI_CHECK_ARRAY_IALIGNMENT_N_2X32(tile)
+#define XAI_CHECK_TILE2D_OALIGNMENT_NX8(tile)            XAI_CHECK_ARRAY_OALIGNMENT_NX8(tile)
+#define XAI_CHECK_TILE2D_OALIGNMENT_2NX8(tile)           XAI_CHECK_ARRAY_OALIGNMENT_2NX8(tile)
+#define XAI_CHECK_TILE2D_OALIGNMENT_NX16(tile)           XAI_CHECK_ARRAY_OALIGNMENT_NX16(tile)
+#define XAI_CHECK_TILE2D_OALIGNMENT_N_2X32(tile)         XAI_CHECK_ARRAY_OALIGNMENT_N_2X32(tile)
+
+// Checks for confinement of 3D and 4D tiles in single DRAM
+#if XAI_EMULATE_LOCAL_RAM && __XTENSA__ && !SYS_MEM_TESTING
+#if XCHAL_NUM_DATARAM == 2
+#define XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(t)                                        \
+  ((XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM0_VADDR) &&   \
+    XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) + XAI_ARRAY_GET_BUFF_SIZE(t)              \
+    <= (((uint32_t) XCHAL_DATARAM0_VADDR) + ((uint32_t) XCHAL_DATARAM0_SIZE))) ||        \
+   ((XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM1_VADDR)) && \
+    (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) + XAI_ARRAY_GET_BUFF_SIZE(t)             \
+     <= (((uint32_t) XCHAL_DATARAM1_VADDR) + ((uint32_t) XCHAL_DATARAM1_SIZE)))))
+#elif XCHAL_NUM_DATARAM == 1
+#define XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(t)                                     \
+  (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM0_VADDR) && \
+   XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) + XAI_ARRAY_GET_BUFF_SIZE(t)            \
+   <= (((uint32_t) XCHAL_DATARAM0_VADDR) + ((uint32_t) XCHAL_DATARAM0_SIZE)))
+#endif
+#define XAI_TILE3D_START_AND_END_IN_SINGLE_DRAM(t)  XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(t)
+#define XAI_TILE4D_START_AND_END_IN_SINGLE_DRAM(t)  XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(t)
+#else
+#define XAI_TILE3D_START_AND_END_IN_SINGLE_DRAM(t)  1
+#define XAI_TILE4D_START_AND_END_IN_SINGLE_DRAM(t)  1
+#define XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(t)   1
+#endif
+
+#define XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(t)                                 \
+  XAI_CHECK_ERROR(XAI_TILE3D_START_AND_END_IN_SINGLE_DRAM(t), XAI_ERR_MEMLOCAL, \
+                  "Complete data for the argument  (" #t ")  does not fit in single DRAM");
+
+#define XAI_CHECK_TILE4D_FITS_IN_SINGLE_DRAM(t)                                 \
+  XAI_CHECK_ERROR(XAI_TILE4D_START_AND_END_IN_SINGLE_DRAM(t), XAI_ERR_MEMLOCAL, \
+                  "Complete data for the argument  (" #t ")  does not fit in single DRAM");
+
+#define XAI_CHECK_ARRAY_FITS_IN_SINGLE_DRAM(parray)                                 \
+  XAI_CHECK_ERROR(XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(parray), XAI_ERR_MEMLOCAL, \
+                  "Complete data for the argument  (" #parray ")  does not fit in single DRAM");
+
+#define XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(tile)                                                                             \
+  XAI_CHECK_ERROR(XAI_TILE2D_STARTS_IN_DRAM(tile), XAI_ERR_MEMLOCAL, "The argument (" #tile ") data does not start in DRAM"); \
+  XAI_CHECK_ERROR(XAI_TILE2D_ENDS_IN_DRAM(tile), XAI_ERR_MEMLOCAL, "Complete data for the argument  (" #tile ")  does not lie in DRAM"); // NOTE(review): a 3D tile is checked with the TILE2D start/end predicates - confirm they cover the whole 3D buffer
+
+#define XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(tile)  XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(tile)
+
+
+// Checks for tile consistency
+#define XAI_TILE3D_IS_CONSISTENT(t)                                                                                                                                                        \
+  ((uint8_t *) XAI_TILE3D_GET_DATA_PTR(t) - (XAI_TILE3D_GET_DIM1_EDGE1(t) + XAI_TILE3D_GET_DIM1_PITCH(t) * XAI_TILE3D_GET_DIM2_EDGE1(t)                                                    \
+                                             + XAI_TILE3D_GET_DIM2_PITCH(t) * XAI_TILE3D_GET_DIM3_EDGE1(t)) * XAI_TILE3D_GET_ELEMENT_SIZE(t) >= (uint8_t *) XAI_TILE3D_GET_BUFF_PTR(t)) && \
+  ((uint8_t *) XAI_TILE3D_GET_DATA_PTR(t) + (XAI_TILE3D_GET_DIM2_PITCH(t) * (XAI_TILE3D_GET_DIM3(t) + XAI_TILE3D_GET_DIM3_EDGE2(t) - 1)                                                    \
+                                             + XAI_TILE3D_GET_DIM1_PITCH(t) * (XAI_TILE3D_GET_DIM2(t) + XAI_TILE3D_GET_DIM2_EDGE2(t) - 1)                                                  \
+                                             + XAI_TILE3D_GET_DIM1(t) + XAI_TILE3D_GET_DIM1_EDGE2(t)) * XAI_TILE3D_GET_ELEMENT_SIZE(t)                                                     \
+   <= (uint8_t *) XAI_TILE3D_GET_BUFF_PTR(t) + XAI_TILE3D_GET_BUFF_SIZE(t)) &&                                                                                                             \
+  (XAI_TILE3D_GET_BUFF_SIZE(t) != 0) &&                                                                                                                                                    \
+  (XAI_TILE3D_GET_DIM1(t) > 0) && (XAI_TILE3D_GET_DIM2(t) > 0) && (XAI_TILE3D_GET_DIM3(t) > 0) &&                                                                                          \
+  (XAI_TILE3D_GET_DIM1_PITCH(t) >= XAI_TILE3D_GET_DIM1(t) + XAI_TILE3D_GET_DIM1_EDGE1(t) + XAI_TILE3D_GET_DIM1_EDGE2(t))
+
+#define XAI_TILE4D_IS_CONSISTENT(t)                                                                                                                                                        \
+  ((uint8_t *) XAI_TILE4D_GET_DATA_PTR(t) - (XAI_TILE4D_GET_DIM1_EDGE1(t) + XAI_TILE4D_GET_DIM1_PITCH(t) * XAI_TILE4D_GET_DIM2_EDGE1(t)                                                    \
+                                             + XAI_TILE4D_GET_DIM2_PITCH(t) * XAI_TILE4D_GET_DIM3_EDGE1(t)) * XAI_TILE4D_GET_ELEMENT_SIZE(t) >= (uint8_t *) XAI_TILE4D_GET_BUFF_PTR(t)) && \
+  ((uint8_t *) XAI_TILE4D_GET_DATA_PTR(t) + (XAI_TILE4D_GET_DIM3_PITCH(t) * (XAI_TILE4D_GET_DIM4(t) - 1)                                                                                   \
+                                             + XAI_TILE4D_GET_DIM2_PITCH(t) * (XAI_TILE4D_GET_DIM3(t) + XAI_TILE4D_GET_DIM3_EDGE2(t) - 1)                                                  \
+                                             + XAI_TILE4D_GET_DIM1_PITCH(t) * (XAI_TILE4D_GET_DIM2(t) + XAI_TILE4D_GET_DIM2_EDGE2(t) - 1)                                                  \
+                                             + XAI_TILE4D_GET_DIM1(t) + XAI_TILE4D_GET_DIM1_EDGE2(t)) * XAI_TILE4D_GET_ELEMENT_SIZE(t)                                                     \
+   <= (uint8_t *) XAI_TILE4D_GET_BUFF_PTR(t) + XAI_TILE4D_GET_BUFF_SIZE(t)) &&                                                                                                             \
+  (XAI_TILE4D_GET_BUFF_SIZE(t) != 0) &&                                                                                                                                                    \
+  (XAI_TILE4D_GET_DIM1(t) > 0) && (XAI_TILE4D_GET_DIM2(t) > 0) && (XAI_TILE4D_GET_DIM3(t) > 0) && (XAI_TILE4D_GET_DIM4(t) > 0) &&                                                          \
+  (XAI_TILE4D_GET_DIM1_PITCH(t) >= XAI_TILE4D_GET_DIM1(t) + XAI_TILE4D_GET_DIM1_EDGE1(t) + XAI_TILE4D_GET_DIM1_EDGE2(t))
+
+#define XAI_TILE3D_SIZE_EQ(t1, t2)                                                                             \
+  (XAI_TILE3D_GET_DIM1(t1) == XAI_TILE3D_GET_DIM1(t2) && XAI_TILE3D_GET_DIM2(t1) == XAI_TILE3D_GET_DIM2(t2) && \
+   XAI_TILE3D_GET_DIM3(t1) == XAI_TILE3D_GET_DIM3(t2))
+
+#define XAI_TILE3D_PITCH_EQ(t1, t2) /* non-zero when both 3D tiles have equal dim1 and dim2 pitches */ \
+  (XAI_TILE3D_GET_DIM1_PITCH(t1) == XAI_TILE3D_GET_DIM1_PITCH(t2) &&                                    \
+   XAI_TILE3D_GET_DIM2_PITCH(t1) == XAI_TILE3D_GET_DIM2_PITCH(t2))
+
+// common tile error checks
+#define XAI_CHECK_TILE3D(tile)                                                                            \
+  XAI_CHECK_POINTER(tile);                                                                                \
+  XAI_CHECK_ERROR(XAI_TILE3D_IS_CONSISTENT(tile), XAI_ERR_BADARG, "The argument (" #tile ") is invalid"); \
+  XAI_CHECK_ERROR(XAI_TYPE_IS_TILE3D(XAI_TILE3D_GET_TYPE(tile)), XAI_ERR_BADARG, "The argument (" #tile ") is not a tile");
+
+
+#define XAI_TILE3D_CHECK_TYPE(a, type) /* XAI_TYPE_ELEMENT_TYPE of the tile type equals (type) and XAI_TYPE_IS_TILE3D holds; (type) is parenthesized so any expression may be passed */ \
+  ((XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(a)) == (type)) && (XAI_TYPE_IS_TILE3D(XAI_TILE3D_GET_TYPE(a))))
+
+#define XAI_TILE3D_CHECK_ELEMENT_SIZE(a, size)  (XAI_ARRAY_GET_ELEMENT_SIZE(a) == (size))
+
+#define XAI_CHECK_TILE3D_SIZE_EQ(t1, t2)                                                                           \
+  XAI_CHECK_ERROR(XAI_TILE3D_SIZE_EQ(t1, t2), XAI_ERR_DATASIZE, "Size of the ("#t1 ") and ("#t2 ") are not same"); \
+  if (XAI_TILE3D_GET_DATA_PTR(t1) == XAI_TILE3D_GET_DATA_PTR(t2))                                                  \
+  {                                                                                                                \
+    XAI_CHECK_ERROR(XAI_TILE3D_PITCH_EQ(t1, t2), XAI_ERR_INPLACE, "Inplace operation not supported when pitch of " \
+                    "("#t1 ") and ("#t2 ") are not same");                                                         \
+  }
+
+#define XAI_CHECK_TILE3D_I(tile, element_size)                           \
+  XAI_CHECK_TILE3D(tile);                                                \
+  XAI_CHECK_ERROR(XAI_TILE3D_CHECK_ELEMENT_SIZE(tile, element_size) &&   \
+                  !((XAI_TILE3D_GET_TYPE(tile)) & (XAI_TYPE_FLOAT_BIT)), \
+                  XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type")
+
+#define XAI_CHECK_TILE3D_X(tile, element_size)                       \
+  XAI_CHECK_TILE3D(tile);                                            \
+  XAI_CHECK_ERROR(XAI_TILE3D_CHECK_ELEMENT_SIZE(tile, element_size), \
+                  XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type")
+
+#define XAI_CHECK_TILE3D_T(tile, type) \
+  XAI_CHECK_TILE3D(tile);              \
+  XAI_CHECK_ERROR(XAI_TILE3D_CHECK_TYPE(tile, type), XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type")
+
+#define XAI_CHECK_TILE3D_I8(array)   XAI_CHECK_TILE3D_I(array, sizeof(int8_t))
+#define XAI_CHECK_TILE3D_I16(array)  XAI_CHECK_TILE3D_I(array, sizeof(int16_t))
+#define XAI_CHECK_TILE3D_I32(array)  XAI_CHECK_TILE3D_I(array, sizeof(int32_t))
+#define XAI_CHECK_TILE3D_I64(array)  XAI_CHECK_TILE3D_I(array, sizeof(int64_t))
+
+#define XAI_CHECK_TILE3D_X16(array)  XAI_CHECK_TILE3D_X(array, sizeof(int16_t))
+#define XAI_CHECK_TILE3D_X32(array)  XAI_CHECK_TILE3D_X(array, sizeof(int32_t))
+
+#define XAI_CHECK_TILE3D_U8(array)   XAI_CHECK_TILE3D_T(array, XAI_U8)
+#define XAI_CHECK_TILE3D_S8(array)   XAI_CHECK_TILE3D_T(array, XAI_S8)
+#define XAI_CHECK_TILE3D_U16(array)  XAI_CHECK_TILE3D_T(array, XAI_U16)
+#define XAI_CHECK_TILE3D_S16(array)  XAI_CHECK_TILE3D_T(array, XAI_S16)
+#define XAI_CHECK_TILE3D_U32(array)  XAI_CHECK_TILE3D_T(array, XAI_U32)
+#define XAI_CHECK_TILE3D_S32(array)  XAI_CHECK_TILE3D_T(array, XAI_S32)
+#define XAI_CHECK_TILE3D_S64(array)  XAI_CHECK_TILE3D_T(array, XAI_S64)
+#define XAI_CHECK_TILE3D_F16(array)  XAI_CHECK_TILE3D_T(array, XAI_F16)
+#define XAI_CHECK_TILE3D_F32(array)  XAI_CHECK_TILE3D_T(array, XAI_F32)
+
+// checks for 4D tiles
+#define XAI_CHECK_TILE4D(tile)                                                                            \
+  XAI_CHECK_POINTER(tile);                                                                                \
+  XAI_CHECK_ERROR(XAI_TILE4D_IS_CONSISTENT(tile), XAI_ERR_BADARG, "The argument (" #tile ") is invalid"); \
+  XAI_CHECK_ERROR(XAI_TYPE_IS_TILE4D(XAI_TILE4D_GET_TYPE(tile)), XAI_ERR_BADARG, "The argument (" #tile ") is not a tile");
+
+#define XAI_TILE4D_SIZE_EQ(t1, t2)                                                                             \
+  (XAI_TILE4D_GET_DIM1(t1) == XAI_TILE4D_GET_DIM1(t2) && XAI_TILE4D_GET_DIM2(t1) == XAI_TILE4D_GET_DIM2(t2) && \
+   XAI_TILE4D_GET_DIM3(t1) == XAI_TILE4D_GET_DIM3(t2) && XAI_TILE4D_GET_DIM4(t1) == XAI_TILE4D_GET_DIM4(t2))
+
+#define XAI_TILE4D_CHECK_TYPE(a, type) /* XAI_TYPE_ELEMENT_TYPE of the tile type equals (type) and XAI_TYPE_IS_TILE4D holds; (type) is parenthesized so any expression may be passed */ \
+  ((XAI_TYPE_ELEMENT_TYPE(XAI_TILE4D_GET_TYPE(a)) == (type)) && (XAI_TYPE_IS_TILE4D(XAI_TILE4D_GET_TYPE(a))))
+
+#define XAI_TILE4D_CHECK_ELEMENT_SIZE(a, size)  (XAI_ARRAY_GET_ELEMENT_SIZE(a) == (size))
+
+#define XAI_CHECK_TILE4D_I(tile, element_size)                           \
+  XAI_CHECK_TILE4D(tile);                                                \
+  XAI_CHECK_ERROR(XAI_TILE4D_CHECK_ELEMENT_SIZE(tile, element_size) &&   \
+                  !((XAI_TILE4D_GET_TYPE(tile)) & (XAI_TYPE_FLOAT_BIT)), \
+                  XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type")
+
+#define XAI_CHECK_TILE4D_X(tile, element_size)                       \
+  XAI_CHECK_TILE4D(tile);                                            \
+  XAI_CHECK_ERROR(XAI_TILE4D_CHECK_ELEMENT_SIZE(tile, element_size), \
+                  XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type")
+
+#define XAI_CHECK_TILE4D_T(tile, type) \
+  XAI_CHECK_TILE4D(tile);              \
+  XAI_CHECK_ERROR(XAI_TILE4D_CHECK_TYPE(tile, type), XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type")
+
+#define XAI_CHECK_TILE4D_SIZE_EQ(t1, t2) \
+  XAI_CHECK_ERROR(XAI_TILE4D_SIZE_EQ(t1, t2), XAI_ERR_DATASIZE, "Size of the ("#t1 ") and ("#t2 ") is not same")
+
+#define XAI_CHECK_TILE4D_I8(array)   XAI_CHECK_TILE4D_I(array, sizeof(int8_t))
+#define XAI_CHECK_TILE4D_I16(array)  XAI_CHECK_TILE4D_I(array, sizeof(int16_t))
+#define XAI_CHECK_TILE4D_I32(array)  XAI_CHECK_TILE4D_I(array, sizeof(int32_t))
+
+#define XAI_CHECK_TILE4D_X16(array)  XAI_CHECK_TILE4D_X(array, sizeof(int16_t))
+#define XAI_CHECK_TILE4D_X32(array)  XAI_CHECK_TILE4D_X(array, sizeof(int32_t))
+
+#define XAI_CHECK_TILE4D_U8(array)   XAI_CHECK_TILE4D_T(array, XAI_U8)
+#define XAI_CHECK_TILE4D_S8(array)   XAI_CHECK_TILE4D_T(array, XAI_S8)
+#define XAI_CHECK_TILE4D_U16(array)  XAI_CHECK_TILE4D_T(array, XAI_U16)
+#define XAI_CHECK_TILE4D_S16(array)  XAI_CHECK_TILE4D_T(array, XAI_S16)
+#define XAI_CHECK_TILE4D_F16(array)  XAI_CHECK_TILE4D_T(array, XAI_F16)
+#define XAI_CHECK_TILE4D_U32(array)  XAI_CHECK_TILE4D_T(array, XAI_U32)
+#define XAI_CHECK_TILE4D_S32(array)  XAI_CHECK_TILE4D_T(array, XAI_S32)
+#define XAI_CHECK_TILE4D_F32(array)  XAI_CHECK_TILE4D_T(array, XAI_F32)
+
+// check the minimal alignment requirements for 3D tile
+#define XAI_TILE3D_IS_STRIDE_ALIGNED(t)      ((XAI_TILE3D_GET_DIM1_PITCH(t) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_TILE3D_IS_STRIDE_ALIGNED2(t)     ((XAI_TILE3D_GET_DIM1_PITCH(t) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_TILE3D_IS_STRIDE_ALIGNED_2(t)    ((XAI_TILE3D_GET_DIM1_PITCH(t) & (XCHAL_IVPN_SIMD_WIDTH / 2 - 1)) == 0)
+#define XAI_TILE3D_IS_STRIDE_ALIGNED_4B(t)   ((XAI_TILE3D_GET_DIM1_PITCH(t) & (3)) == 0)
+#define XAI_TILE3D_IS_PTR_ALIGNED_NX8(t)     ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(t)) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_TILE3D_IS_PTR_ALIGNED_2NX8(t)    ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_TILE3D_IS_PTR_ALIGNED_NX16(t)    ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_TILE3D_IS_PTR_ALIGNED_N_2X32(t)  ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_TILE3D_IS_PTR_ALIGNED_4B(t)      ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(t)) & 3) == 0)
+
+
+#define XAI_TILE3D_IS_ALIGNED_NX8(t)     (XAI_TILE3D_IS_PTR_ALIGNED_NX8(t) && XAI_TILE3D_IS_STRIDE_ALIGNED(t))
+#define XAI_TILE3D_IS_ALIGNED_2NX8(t)    (XAI_TILE3D_IS_PTR_ALIGNED_2NX8(t) && XAI_TILE3D_IS_STRIDE_ALIGNED2(t))
+#define XAI_TILE3D_IS_ALIGNED_NX16(t)    (XAI_TILE3D_IS_PTR_ALIGNED_NX16(t) && XAI_TILE3D_IS_STRIDE_ALIGNED(t))
+#define XAI_TILE3D_IS_ALIGNED_N_2X32(t)  (XAI_TILE3D_IS_PTR_ALIGNED_N_2X32(t) && XAI_TILE3D_IS_STRIDE_ALIGNED_2(t))
+#define XAI_TILE3D_IS_ALIGNED_4B(t)      (XAI_TILE3D_IS_PTR_ALIGNED_4B(t) && XAI_TILE3D_IS_STRIDE_ALIGNED_4B(t))
+
+#define XAI_CHECK_TILE3D_ALIGNMENT(array, DEPTH, ERR) \
+  XAI_CHECK_ERROR(XAI_TILE3D_IS_ALIGNED_ ## DEPTH(array), XAI_ERR_ ## ERR, "The argument (" #array ") is not fully aligned")
+
+#define XAI_CHECK_TILE3D_IALIGNMENT_NX8(array)     XAI_CHECK_TILE3D_ALIGNMENT(array, NX8, IALIGNMENT)
+#define XAI_CHECK_TILE3D_IALIGNMENT_2NX8(array)    XAI_CHECK_TILE3D_ALIGNMENT(array, 2NX8, IALIGNMENT)
+#define XAI_CHECK_TILE3D_IALIGNMENT_NX16(array)    XAI_CHECK_TILE3D_ALIGNMENT(array, NX16, IALIGNMENT)
+#define XAI_CHECK_TILE3D_IALIGNMENT_N_2X32(array)  XAI_CHECK_TILE3D_ALIGNMENT(array, N_2X32, IALIGNMENT)
+#define XAI_CHECK_TILE3D_OALIGNMENT_NX8(array)     XAI_CHECK_TILE3D_ALIGNMENT(array, NX8, OALIGNMENT)
+#define XAI_CHECK_TILE3D_OALIGNMENT_2NX8(array)    XAI_CHECK_TILE3D_ALIGNMENT(array, 2NX8, OALIGNMENT)
+#define XAI_CHECK_TILE3D_OALIGNMENT_NX16(array)    XAI_CHECK_TILE3D_ALIGNMENT(array, NX16, OALIGNMENT)
+#define XAI_CHECK_TILE3D_OALIGNMENT_N_2X32(array)  XAI_CHECK_TILE3D_ALIGNMENT(array, N_2X32, OALIGNMENT)
+#define XAI_CHECK_TILE3D_CALIGNMENT_NX8(array)     XAI_CHECK_TILE3D_ALIGNMENT(array, NX8, IALIGNMENT)  // the CALIGNMENT checks report XAI_ERR_IALIGNMENT (ERR is pasted onto XAI_ERR_)
+#define XAI_CHECK_TILE3D_CALIGNMENT_2NX8(array)    XAI_CHECK_TILE3D_ALIGNMENT(array, 2NX8, IALIGNMENT)
+#define XAI_CHECK_TILE3D_CALIGNMENT_NX16(array)    XAI_CHECK_TILE3D_ALIGNMENT(array, NX16, IALIGNMENT)
+#define XAI_CHECK_TILE3D_CALIGNMENT_N_2X32(array)  XAI_CHECK_TILE3D_ALIGNMENT(array, N_2X32, IALIGNMENT)
+
+// check the minimal alignment requirements for 4D tile
+#define XAI_TILE4D_IS_STRIDE_ALIGNED(t)      ((XAI_TILE4D_GET_DIM1_PITCH(t) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_TILE4D_IS_STRIDE_ALIGNED2(t)     ((XAI_TILE4D_GET_DIM1_PITCH(t) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_TILE4D_IS_STRIDE_ALIGNED_2(t)    ((XAI_TILE4D_GET_DIM1_PITCH(t) & (XCHAL_IVPN_SIMD_WIDTH / 2 - 1)) == 0)
+#define XAI_TILE4D_IS_PTR_ALIGNED_NX8(t)     ((XAI_PTR_TO_ADDR(XAI_TILE4D_GET_DATA_PTR(t)) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_TILE4D_IS_PTR_ALIGNED_2NX8(t)    ((XAI_PTR_TO_ADDR(XAI_TILE4D_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_TILE4D_IS_PTR_ALIGNED_NX16(t)    ((XAI_PTR_TO_ADDR(XAI_TILE4D_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+#define XAI_TILE4D_IS_PTR_ALIGNED_N_2X32(t)  ((XAI_PTR_TO_ADDR(XAI_TILE4D_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)
+
+#define XAI_TILE4D_IS_ALIGNED_NX8(t)         (XAI_TILE4D_IS_PTR_ALIGNED_NX8(t) && XAI_TILE4D_IS_STRIDE_ALIGNED(t))
+#define XAI_TILE4D_IS_ALIGNED_2NX8(t)        (XAI_TILE4D_IS_PTR_ALIGNED_2NX8(t) && XAI_TILE4D_IS_STRIDE_ALIGNED2(t))
+#define XAI_TILE4D_IS_ALIGNED_NX16(t)        (XAI_TILE4D_IS_PTR_ALIGNED_NX16(t) && XAI_TILE4D_IS_STRIDE_ALIGNED(t))
+#define XAI_TILE4D_IS_ALIGNED_N_2X32(t)      (XAI_TILE4D_IS_PTR_ALIGNED_N_2X32(t) && XAI_TILE4D_IS_STRIDE_ALIGNED_2(t))
+
+#define XAI_CHECK_TILE4D_ALIGNMENT(array, DEPTH, ERR) \
+  XAI_CHECK_ERROR(XAI_TILE4D_IS_ALIGNED_ ## DEPTH(array), XAI_ERR_ ## ERR, "The argument (" #array ") is not fully aligned")
+
+#define XAI_CHECK_TILE4D_IALIGNMENT_NX8(array)           XAI_CHECK_TILE4D_ALIGNMENT(array, NX8, IALIGNMENT)
+#define XAI_CHECK_TILE4D_IALIGNMENT_2NX8(array)          XAI_CHECK_TILE4D_ALIGNMENT(array, 2NX8, IALIGNMENT)
+#define XAI_CHECK_TILE4D_IALIGNMENT_NX16(array)          XAI_CHECK_TILE4D_ALIGNMENT(array, NX16, IALIGNMENT)
+#define XAI_CHECK_TILE4D_IALIGNMENT_N_2X32(array)        XAI_CHECK_TILE4D_ALIGNMENT(array, N_2X32, IALIGNMENT)
+#define XAI_CHECK_TILE4D_OALIGNMENT_NX8(array)           XAI_CHECK_TILE4D_ALIGNMENT(array, NX8, OALIGNMENT)
+#define XAI_CHECK_TILE4D_OALIGNMENT_2NX8(array)          XAI_CHECK_TILE4D_ALIGNMENT(array, 2NX8, OALIGNMENT)
+#define XAI_CHECK_TILE4D_OALIGNMENT_NX16(array)          XAI_CHECK_TILE4D_ALIGNMENT(array, NX16, OALIGNMENT)
+#define XAI_CHECK_TILE4D_OALIGNMENT_N_2X32(array)        XAI_CHECK_TILE4D_ALIGNMENT(array, N_2X32, OALIGNMENT)
+
+#define XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(tile0, tile1)  XAI_CHECK_ARRAYS_ARE_NOT_OVERLAP(tile0, tile1)
+#define XAI_CHECK_TILES4D_ARE_NOT_OVERLAP(tile0, tile1)  XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(tile0, tile1)
+
+#define XAI_CHECK_TILE3D_EQUAL(tile1, tile2)                                                  \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(tile1) == XAI_TILE3D_GET_DIM1(tile2) &&                 \
+                  XAI_TILE3D_GET_DIM2(tile1) == XAI_TILE3D_GET_DIM2(tile2) &&                 \
+                  XAI_TILE3D_GET_DIM3(tile1) == XAI_TILE3D_GET_DIM3(tile2), XAI_ERR_DATASIZE, \
+                  "Tiles sizes are not equal.");
+
+#define XAI_CHECK_TILE4D_EQUAL(tile1, tile2)                                                  \
+  XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(tile1) == XAI_TILE4D_GET_DIM1(tile2) &&                 \
+                  XAI_TILE4D_GET_DIM2(tile1) == XAI_TILE4D_GET_DIM2(tile2) &&                 \
+                  XAI_TILE4D_GET_DIM3(tile1) == XAI_TILE4D_GET_DIM3(tile2) &&                 \
+                  XAI_TILE4D_GET_DIM4(tile1) == XAI_TILE4D_GET_DIM4(tile2), XAI_ERR_DATASIZE, \
+                  "Tiles sizes are not equal.");
+
+#define XAI_CHECK_TILE3D_ELEMENT_SIZE_EQ(inT, outT) /* passes XAI_ERR_DATATYPE to XAI_CHECK_ERROR when the element sizes differ */ \
+  XAI_CHECK_ERROR(XAI_TILE3D_GET_ELEMENT_SIZE(inT) == XAI_TILE3D_GET_ELEMENT_SIZE(outT),                                            \
+                  XAI_ERR_DATATYPE, "Input tile element size must be equal to output tile element size")
+
+#define XAI_CHECK_TILE4D_ELEMENT_SIZE_EQ(inT, outT) /* passes XAI_ERR_DATATYPE to XAI_CHECK_ERROR when the element sizes differ */ \
+  XAI_CHECK_ERROR(XAI_TILE4D_GET_ELEMENT_SIZE(inT) == XAI_TILE4D_GET_ELEMENT_SIZE(outT),                                            \
+                  XAI_ERR_DATATYPE, "Input tile element size must be equal to output tile element size")
+
+#ifdef XAI_ERROR_CHECKS_RELAXED_REF
+#undef XAI_CHECK_TILE4D_IALIGNMENT_2NX8
+#undef XAI_ARRAY_STARTS_IN_DRAM
+#undef XAI_ARRAY_ENDS_IN_DRAM
+#undef XAI_TILE2D_STARTS_IN_DRAM
+#undef XAI_TILE2D_ENDS_IN_DRAM
+#undef XAI_TILE3D_START_AND_END_IN_SINGLE_DRAM
+#undef XAI_TILE4D_START_AND_END_IN_SINGLE_DRAM
+#undef XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM
+#undef XAI_ARRAYS_ARE_NOT_OVERLAP
+
+#define XAI_CHECK_TILE4D_IALIGNMENT_2NX8(array)
+#define XAI_ARRAY_STARTS_IN_DRAM(t)                 1
+#define XAI_ARRAY_ENDS_IN_DRAM(t)                   1
+#define XAI_TILE2D_STARTS_IN_DRAM(t)                  1
+#define XAI_TILE2D_ENDS_IN_DRAM(t)                    1
+#define XAI_TILE3D_START_AND_END_IN_SINGLE_DRAM(t)  1
+#define XAI_TILE4D_START_AND_END_IN_SINGLE_DRAM(t)  1
+#define XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(t)   1
+#define XAI_ARRAYS_ARE_NOT_OVERLAP(t1, t2)          1
+#endif
+
+#if defined SYS_MEM_TESTING || defined XAI_ERROR_CHECKS_RELAXED_REF
+#undef XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY
+#define XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(tile)
+#endif
+
+// other macros: XAI_TO_Q*(val) multiplies val by a power of two, adds 0.5 and truncates through the integer cast
+#define XAI_TO_Q15(val)     ((int16_t) ((val) * (1 << 15) + 0.5))  // NOTE(review): +0.5 then truncation rounds to nearest only for val >= 0, and nothing saturates - confirm callers
+#define XAI_TO_Q1_14(val)   ((int16_t) ((val) * (1 << 14) + 0.5))
+#define XAI_TO_Q2_13(val)   ((int16_t) ((val) * (1 << 13) + 0.5))
+#define XAI_TO_Q3_12(val)   ((int16_t) ((val) * (1 << 12) + 0.5))
+#define XAI_TO_Q4_11(val)   ((int16_t) ((val) * (1 << 11) + 0.5))
+#define XAI_TO_Q5_10(val)   ((int16_t) ((val) * (1 << 10) + 0.5))
+#define XAI_TO_Q13_18(val)  ((int) ((val) * (1 << 18) + 0.5))
+#define XAI_Q0_16_HALF  0x8000  // equals 1 << 15
+#endif
diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_core_api.h b/backends/cadence/vision/third-party/libxai_common/include/xai_core_api.h
new file mode 100644
index 00000000000..894ae20b9d7
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/include/xai_core_api.h
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2013-2018 Tensilica Inc. ALL RIGHTS RESERVED.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __XAI_CORE_API_H__
+#define __XAI_CORE_API_H__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+
+#include "xai_config_api.h"
+#include "xai_tile_manager.h"
+
+/* library information */
+// _XAI_API_ is defined in glow/externalbackends/Xtensa/Backends/libxai/libxai.h and xtensa-mlir-dialect/include/xtensa/Conversion/xaicnn.h
+// They don't use _XAI_API_ from xaicnn/libxai/include/xai_config_api.h and hence they don't get _XAI_API_VAR_,
+// so _XAI_API_VAR_ is defined here for those cases.
+
+#ifndef _XAI_API_VAR_
+#define _XAI_API_VAR_  _XAI_API_
+#endif
+
+_XAI_API_VAR_ char XAI_BUILD_CONFIGURATION[];
+_XAI_API_VAR_ char XAI_BUILD_TOOLS_VERSION[];
+_XAI_API_VAR_ char XAI_BUILD_CORE_ID[];
+_XAI_API_VAR_ char XAI_BUILD_ERROR_LEVEL[];
+_XAI_API_VAR_ char XAI_BUILD_FEATURES_STR[];
+
+/* Math constants */
+
+#define XAI_PI    3.14159265358979323846
+#define XAI_PI_F  3.14159265358979323846f
+
+/* IVP library data types */
+
+typedef int32_t    XAI_ERR_TYPE;
+typedef uint8_t    xai_bool;
+
+typedef int16_t    XAI_Q0_15;
+typedef int16_t    XAI_Q5_10;
+typedef int16_t    XAI_Q6_9;
+typedef int16_t    XAI_Q7_8;
+typedef int16_t    XAI_Q8_7;
+typedef int16_t    XAI_Q12_3;
+typedef int16_t    XAI_Q13_2;
+
+typedef int32_t    XAI_Q0_31;
+typedef int32_t    XAI_Q1_30;
+typedef int32_t    XAI_Q12_19;
+typedef int32_t    XAI_Q13_18;
+typedef int32_t    XAI_Q15_16;
+typedef int32_t    XAI_Q16_15;
+typedef int32_t    XAI_Q22_9;
+typedef int32_t    XAI_Q28_3;
+
+typedef XAI_Q0_15  XAI_Q15;
+typedef uint16_t   XAI_Q0_16;
+
+
+typedef struct
+{
+  int16_t x;
+  int16_t y;
+} xai_point;
+
+typedef struct
+{
+  int32_t x;
+  int32_t y;
+} xai_point32;
+
+typedef struct
+{
+  XAI_Q16_15 x;
+  XAI_Q16_15 y;
+} xai_point_fpt;
+
+typedef struct
+{
+  float x;
+  float y;
+} xai_point_f;
+
+typedef struct
+{
+  int32_t width;
+  int32_t height;
+} xai_size;
+
+typedef struct
+{
+  float a11;
+  float a12;
+  float a21;
+  float a22;
+  float xt;
+  float yt;
+} xai_affine;
+
+typedef struct
+{
+  XAI_Q13_18 a11;
+  XAI_Q13_18 a12;
+  XAI_Q13_18 a21;
+  XAI_Q13_18 a22;
+  XAI_Q13_18 xt;
+  XAI_Q13_18 yt;
+} xai_affine_fpt;
+
+typedef struct
+{
+  float a11;
+  float a12;
+  float a13;
+  float a21;
+  float a22;
+  float a23;
+  float a31;
+  float a32;
+  float a33;
+} xai_perspective;
+
+typedef struct
+{
+  XAI_Q13_18 a11;
+  XAI_Q13_18 a12;
+  XAI_Q13_18 a13;
+  XAI_Q13_18 a21;
+  XAI_Q13_18 a22;
+  XAI_Q13_18 a23;
+  XAI_Q13_18 a31;
+  XAI_Q13_18 a32;
+  XAI_Q13_18 a33;
+} xai_perspective_fpt;
+
+typedef struct
+{
+  int16_t  x;
+  int16_t  y;
+  uint16_t width;
+  uint16_t height;
+} xai_rect;
+
+typedef struct
+{
+  int16_t   x;
+  int16_t   y;
+  uint16_t  width;
+  uint16_t  height;
+  XAI_Q5_10 angle;
+} xai_rotated_rect;
+
+typedef struct
+{
+  float x;
+  float y;
+  float width;
+  float height;
+  float angle;
+} xai_rotated_rect_f;
+
+typedef struct
+{
+  int32_t M00;
+  int64_t M10;
+  int64_t M01;
+  int64_t M11;
+  int64_t M20;
+  int64_t M02;
+} xai_moments;
+
+typedef struct
+{
+  XAI_Q13_18 rho;
+  XAI_Q13_18 theta;
+} xai_line_polar_fpt;
+
+typedef struct
+{
+  uint32_t size;      // number of pyramid levels
+  float    scale;
+  xai_tile2D **levels;   // array of pyramid levels
+} xai_pyramid, *xai_pPyramid;
+#define XAI_HAS_PYRAMID  1
+
+
+/* Error codes */
+
+#define XAI_ERR_OK                 0  // no error
+#define XAI_ERR_IALIGNMENT         1  // input alignment requirements are not satisfied
+#define XAI_ERR_OALIGNMENT         2  // output alignment requirements are not satisfied
+#define XAI_ERR_MALIGNMENT         3  // same modulo alignment requirement is not satisfied
+#define XAI_ERR_BADARG             4  // arguments are somehow invalid
+#define XAI_ERR_MEMLOCAL           5  // tile is not placed in local memory
+#define XAI_ERR_INPLACE            6  // inplace operation is not supported
+#define XAI_ERR_EDGE               7  // edge extension size is too small
+#define XAI_ERR_DATASIZE           8  // input/output tile size is too small or too big or otherwise inconsistent
+#define XAI_ERR_TMPSIZE            9  // temporary tile size is too small or otherwise inconsistent
+#define XAI_ERR_KSIZE              10 // filter kernel size is not supported
+#define XAI_ERR_NORM               11 // invalid normalization divisor or shift value
+#define XAI_ERR_COORD              12 // invalid coordinates
+#define XAI_ERR_BADTRANSFORM       13 // the transform is singular or otherwise invalid
+#define XAI_ERR_NULLARG            14 // one of required arguments is null
+#define XAI_ERR_THRESH_INVALID     15 // threshold value is somehow invalid
+#define XAI_ERR_SCALE              16 // provided scale factor is not supported
+#define XAI_ERR_OVERFLOW           17 // tile size can lead to sum overflow
+#define XAI_ERR_NOTIMPLEMENTED     18 // the requested functionality is absent in current version
+#define XAI_ERR_CHANNEL_INVALID    19 // invalid channel number
+#define XAI_ERR_DATATYPE           20 // argument has invalid data type
+#define XAI_ERR_NO_VARIANT         21 // No suitable variant found for the function
+#define XAI_ERR_PTR_NULL           22 // Pointer is NULL
+#define XAI_ERR_CUSTOMACC_PREPARE  23 // fails to prepare the custom acc hardware
+#define XAI_ERR_CUSTOMACC_EXECUTE  24 // fails to execute ops on the custom acc hardware
+#define XAI_ERR_CUSTOMACC_REMOVE   25 // fails to remove a network for the custom acc hardware
+#define XAI_ERR_LAST               25
+
+/* non-fatal errors */
+
+#define XAI_ERR_POOR_DECOMPOSITION  1024 // computed transform decomposition can produce visual artifacts
+#define XAI_ERR_OUTOFTILE           1025 // arguments or results are out of tile
+#define XAI_ERR_OBJECTLOST          1026 // tracked object is lost
+#define XAI_ERR_RANSAC_NOTFOUND     1027 // there is no found appropriate model for RANSAC
+#define XAI_ERR_REPLAY              1028 // function has to be called again for completion
+
+
+/* helper macro */
+
+#ifdef XCHAL_IVPN_SIMD_WIDTH
+#  define XAI_SIMD_WIDTH  XCHAL_IVPN_SIMD_WIDTH
+#else
+#  define XAI_SIMD_WIDTH  32
+#endif
+
+#define XAI_SIZE_AREA(sz)              ((size_t) (sz).width * (sz).height)  // width * height of sz; (sz) is parenthesized so expressions such as *p expand correctly
+#define XAI_ALIGN_VAL(val, pow2)       (((val) + ((pow2) - 1)) & ~((pow2) - 1))
+#define XAI_ALIGN_VALN(val)            XAI_ALIGN_VAL(val, XAI_SIMD_WIDTH)
+
+#define XAI_PTR_TO_ADDR(ptr)           ((uintptr_t) (ptr))
+#define XAI_ALIGN_PTR(ptr, alignment)  ((void *) XAI_ALIGN_VAL(XAI_PTR_TO_ADDR((ptr)), (alignment)))
+
+/* temporary space requirement for xaiSort */
+#if XCHAL_HAVE_GRIVPEP_HISTOGRAM || XCHAL_HAVE_VISION_HISTOGRAM
+#   define XAI_SORT_TMP_SIZE  0                                       // use vector registers only
+#elif XCHAL_HAVE_VISION
+#   define XAI_SORT_TMP_SIZE  (XAI_SIMD_WIDTH * 256 + XAI_SIMD_WIDTH)   // SIMD_WIDTH histograms by 256 bins + XAI_SIMD_WIDTH for pointer alignment inside optimized function
+#else
+#   define XAI_SORT_TMP_SIZE  (2 * 256 + XAI_SIMD_WIDTH)               // 2 * 256 bins + XAI_SIMD_WIDTH for pointer alignment inside optimized function; NOTE(review): this comment used to say 3 histograms - confirm the intended count
+#endif
+
+
+/* error code to text conversion */
+_XAI_API_ const char* xaiErrStr(XAI_ERR_TYPE code);
+#endif
diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_tile_manager.h b/backends/cadence/vision/third-party/libxai_common/include/xai_tile_manager.h
new file mode 100644
index 00000000000..71227adbf34
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/include/xai_tile_manager.h
@@ -0,0 +1,1246 @@
+/*
+ * Copyright (c) 2013-2018 Tensilica Inc. ALL RIGHTS RESERVED.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __XAI_TILE_MANAGER_H__
+#define __XAI_TILE_MANAGER_H__
+
+#include <stdint.h>
+#include <stddef.h>
+#include "xai_config_api.h"
+
+typedef struct xaiFrameStruct // 2D frame descriptor; xai_pFrame is the matching pointer type
+{
+  void     *pFrameBuff;      // frame buffer pointer; NULL is what XAI_FRAME_CHECK_VIRTUAL_FRAME tests for
+  uint32_t frameBuffSize;    // size of the frame buffer (presumably in bytes - TODO confirm)
+  void     *pFrameData;      // frame data pointer (presumably the first pixel inside pFrameBuff - TODO confirm)
+  int32_t  frameWidth;
+  int32_t  frameHeight;
+  int32_t  framePitch;       // row pitch (units are not stated in this header - TODO confirm)
+  uint8_t  pixelRes;         // pixel resolution (the xai_frame3D counterpart documents this as "in bits")
+  uint8_t  pixelPackFormat;
+} xai_frame, *xai_pFrame;
+
+#define XAI_ARRAY_FIELDS /* leading fields shared by xai_array and xai_tile2D */ \
+  void *pBuffer;         \
+  void *pData;           \
+  uint32_t bufferSize;   \
+  int32_t width;         \
+  int32_t pitch;  /* also accessed as "capacity" through XAI_ARRAY_GET/SET_CAPACITY */ \
+  uint32_t status;       \
+  uint16_t type;  /* XAI_TYPE_* bit encoding: sign, float, tile kind, element size */ \
+  int32_t height;
+
+typedef struct xaiArrayStruct // plain array descriptor: exactly the XAI_ARRAY_FIELDS members
+{
+  XAI_ARRAY_FIELDS
+} xai_array, *xai_pArray;
+
+#define XAI_ARRAY_FIELDS_COEFF_32 /* same member names as XAI_ARRAY_FIELDS; pointers are uintptr_t, bufferSize/width/pitch are 64-bit */ \
+  uintptr_t pBuffer;              \
+  uintptr_t pData;                \
+  uint64_t bufferSize;            \
+  uint64_t width;                 \
+  int64_t pitch;                  \
+  uint32_t status;                \
+  uint16_t type;                  \
+  int32_t height;
+
+typedef struct xaiArrayStruct_coeff_32 // array descriptor built from XAI_ARRAY_FIELDS_COEFF_32
+{
+  XAI_ARRAY_FIELDS_COEFF_32
+} xai_array_coeff_32, *xai_pArray_coeff_32;
+
+#define XAI_ARRAY_FIELDS_COEFF_64 /* same member names as XAI_ARRAY_FIELDS; pointers are stored as uint64_t, bufferSize/width/pitch are 64-bit */ \
+  uint64_t pBuffer;               \
+  uint64_t pData;                 \
+  uint64_t bufferSize;            \
+  uint64_t width;                 \
+  int64_t pitch;                  \
+  uint32_t status;                \
+  uint16_t type;                  \
+  int32_t height;
+
+typedef struct xaiArrayStruct_coeff_64 // array descriptor built from XAI_ARRAY_FIELDS_COEFF_64
+{
+  XAI_ARRAY_FIELDS_COEFF_64
+} xai_array_coeff_64, *xai_pArray_coeff_64;
+
+typedef struct xaiTile2DStruct // 2D tile: the XAI_ARRAY_FIELDS members followed by frame pointer, position and edge sizes
+{
+  XAI_ARRAY_FIELDS
+  xai_frame *pFrame;     // associated frame (dereferenced by XAI_TILE2D_CHECK_VIRTUAL_FRAME)
+  int32_t   x;           // accessed through XAI_TILE2D_GET/SET_X_COORD
+  int32_t   y;           // accessed through XAI_TILE2D_GET/SET_Y_COORD
+  uint16_t  edgeWidth;   // accessed through XAI_TILE2D_GET/SET_EDGE_WIDTH
+  uint16_t  edgeHeight;  // accessed through XAI_TILE2D_GET/SET_EDGE_HEIGHT
+} xai_tile2D, *xai_pTile2D;
+
+/*****************************************
+*   Data type definitions
+*****************************************/
+
+//** 16-bit data type: bits 0 - 3 encode the element depth (bits/bytes), bits 4 - 7 are free (reserved for future use), bits 8 - 10 encode the special float type,
+//** bit 11 is the float flag (denotes whether float or not), bits 12 - 14 encode the tile type, and bit 15 is the sign flag
+
+#define XAI_TYPE_SIGNED_BIT          (1 << 15)
+
+#define XAI_TYPE_ARRAY_BITS          (1 << 12)
+#define XAI_TYPE_TILE2D_BITS         (2 << 12)
+#define XAI_TYPE_TILE3D_BITS         (3 << 12)
+#define XAI_TYPE_TILE4D_BITS         (4 << 12)
+#define XAI_TYPE_TILE5D_BITS         (5 << 12)
+#define XAI_TYPE_TILE6D_BITS         (6 << 12)
+#define XAI_TYPE_TILE_BITS           3
+#define XAI_TYPE_TILE_MASK           (((1 << XAI_TYPE_TILE_BITS) - 1) << 12) // selects bits 12 - 14 (array / tileND kind)
+
+#define XAI_TYPE_FLOAT_BIT           (1 << 11)
+#define XAI_TYPE_SPECIAL_FLOAT_BITS  3
+#define XAI_TYPE_SPECIAL_FLOAT_MASK  (((1 << XAI_TYPE_SPECIAL_FLOAT_BITS) - 1) << 8) // selects bits 8 - 10
+#define XAI_TYPE_BFLOAT_BIT          (XAI_TYPE_FLOAT_BIT | (1 << 8))
+
+#define XAI_TYPE_ELEMENT_SIZE_BITS   4
+#define XAI_TYPE_ELEMENT_SIZE_MASK   ((1 << XAI_TYPE_ELEMENT_SIZE_BITS) - 1) // selects bits 0 - 3 (depth code)
+
+#define XAI_MAKETYPE(flags, depth)            ((flags) | (depth))
+#define XAI_CUSTOMTYPE(type)                  XAI_MAKETYPE(0, (sizeof(type) + 2))   //convert byte to representation sequence; NOTE(review): sizeof + 2 matches the depth codes only for 1- and 2-byte types (3 = 8bit, 4 = 16bit); a 4-byte type yields 6, the 64bit code - confirm intent
+
+#define XAI_TYPE_ELEMENT_SIZE_IN_BYTES(type)  (1 << (((type) & (XAI_TYPE_ELEMENT_SIZE_MASK)) - 3)) // 2^(depth code - 3); depth codes below 3 (bool, 2bit, 4bit) produce a negative shift count, so only use with 8bit-or-wider types
+#define XAI_TYPE_ELEMENT_SIZE_IN_BITS(type)   (XAI_TYPE_ELEMENT_SIZE_IN_BYTES(type) << 3)
+#define XAI_TYPE_ELEMENT_SIZE(type)           XAI_TYPE_ELEMENT_SIZE_IN_BYTES(type)
+#define XAI_TYPE_ELEMENT_TYPE(type)           ((type) & (XAI_TYPE_SIGNED_BIT | XAI_TYPE_ELEMENT_SIZE_MASK | XAI_TYPE_FLOAT_BIT | XAI_TYPE_SPECIAL_FLOAT_MASK)) // strips the array/tile kind bits, keeps sign, float and depth
+#define XAI_TYPE_IS_ARRAY(type)               (!(((type) & (XAI_TYPE_TILE_MASK)) ^ XAI_TYPE_ARRAY_BITS))  // 1 when the kind bits equal XAI_TYPE_ARRAY_BITS exactly
+#define XAI_TYPE_IS_TILE2D(type)              (!(((type) & (XAI_TYPE_TILE_MASK)) ^ XAI_TYPE_TILE2D_BITS)) // 1 when the kind bits equal XAI_TYPE_TILE2D_BITS exactly
+#define XAI_TYPE_IS_SIGNED(type)              ((type) & (XAI_TYPE_SIGNED_BIT)) // non-zero (not necessarily 1) when the sign bit is set
+
+// XAI_MAKETYPE accepts 2 parameters
+// 1: Flags: whether the entity is a tile (XAI_TYPE_TILE2D_BITS, XAI_TYPE_TILE3D_BITS etc. flag set) or an array (XAI_TYPE_ARRAY_BITS flag set),
+//    whether the data is signed or unsigned (XAI_TYPE_SIGNED_BIT), and whether the data is float (XAI_TYPE_FLOAT_BIT) and of which float type (XAI_TYPE_BFLOAT_BIT etc.)
+// 2: Encoded number of bits/bytes:
+//    0 implies the data is bool, 1 implies the data is 2bit, 2 implies the data is 4bit, 3 implies the data is 8bit, 4 implies the data is 16bit,
+//    5 implies the data is 32bit, 6 implies the data is 64bit and 7 implies the data is 128bit
+
+#define XAI_BOOL         XAI_MAKETYPE(0, 0) // unsigned element types: no flag bits, depth code only
+#define XAI_U2           XAI_MAKETYPE(0, 1)
+#define XAI_U4           XAI_MAKETYPE(0, 2)
+#define XAI_U8           XAI_MAKETYPE(0, 3)
+#define XAI_U16          XAI_MAKETYPE(0, 4)
+#define XAI_U32          XAI_MAKETYPE(0, 5)
+#define XAI_U64          XAI_MAKETYPE(0, 6)
+#define XAI_U128         XAI_MAKETYPE(0, 7)
+
+#define XAI_S2           XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 1) // signed integer element types: sign flag + depth code
+#define XAI_S4           XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 2)
+#define XAI_S8           XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 3)
+#define XAI_S16          XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 4)
+#define XAI_S32          XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 5)
+#define XAI_S64          XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 6)
+#define XAI_S128         XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 7)
+
+#define XAI_F8           (XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT | XAI_TYPE_FLOAT_BIT, 3)) // float element types carry both the sign and the float flag
+#define XAI_F16          (XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT | XAI_TYPE_FLOAT_BIT, 4))
+#define XAI_F32          (XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT | XAI_TYPE_FLOAT_BIT, 5))
+#define XAI_F64          (XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT | XAI_TYPE_FLOAT_BIT, 6))
+#define XAI_F128         (XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT | XAI_TYPE_FLOAT_BIT, 7))
+
+#define XAI_ARRAY_BOOL   (XAI_BOOL | XAI_TYPE_ARRAY_BITS) // array types: element type OR'ed with XAI_TYPE_ARRAY_BITS
+#define XAI_ARRAY_U4     (XAI_U4 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_U8     (XAI_U8 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_U16    (XAI_U16 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_U32    (XAI_U32 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_U64    (XAI_U64 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_U128   (XAI_U128 | XAI_TYPE_ARRAY_BITS)
+
+#define XAI_ARRAY_S4     (XAI_S4 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_S8     (XAI_S8 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_S16    (XAI_S16 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_S32    (XAI_S32 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_S64    (XAI_S64 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_S128   (XAI_S128 | XAI_TYPE_ARRAY_BITS)
+
+#define XAI_ARRAY_F8     (XAI_F8 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_F16    (XAI_F16 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_F32    (XAI_F32 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_F64    (XAI_F64 | XAI_TYPE_ARRAY_BITS)
+#define XAI_ARRAY_F128   (XAI_F128 | XAI_TYPE_ARRAY_BITS)
+
+#define XAI_TILE2D_BOOL  (XAI_BOOL | XAI_TYPE_TILE2D_BITS) // 2D tile types: element type OR'ed with XAI_TYPE_TILE2D_BITS
+#define XAI_TILE2D_U4    (XAI_U4 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_U8    (XAI_U8 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_U16   (XAI_U16 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_U32   (XAI_U32 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_U64   (XAI_U64 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_U128  (XAI_U128 | XAI_TYPE_TILE2D_BITS)
+
+#define XAI_TILE2D_S4    (XAI_S4 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_S8    (XAI_S8 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_S16   (XAI_S16 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_S32   (XAI_S32 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_S64   (XAI_S64 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_S128  (XAI_S128 | XAI_TYPE_TILE2D_BITS)
+
+#define XAI_TILE2D_F8    (XAI_F8 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_F16   (XAI_F16 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_F32   (XAI_F32 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_F64   (XAI_F64 | XAI_TYPE_TILE2D_BITS)
+#define XAI_TILE2D_F128  (XAI_F128 | XAI_TYPE_TILE2D_BITS)
+
+/*****************************************
+*                   Frame Access Macros
+*****************************************/
+#define XAI_FRAME_GET_BUFF_PTR(pFrame)                   ((pFrame)->pFrameBuff)
+#define XAI_FRAME_SET_BUFF_PTR(pFrame, pBuff)            (pFrame)->pFrameBuff = ((void *) (pBuff)) // setters in this group expand to a bare assignment (no outer parentheses); use them as statements
+
+#define XAI_FRAME_GET_BUFF_SIZE(pFrame)                  ((pFrame)->frameBuffSize)
+#define XAI_FRAME_SET_BUFF_SIZE(pFrame, buffSize)        (pFrame)->frameBuffSize = ((uint32_t) (buffSize))
+
+#define XAI_FRAME_GET_DATA_PTR(pFrame)                   ((pFrame)->pFrameData)
+#define XAI_FRAME_SET_DATA_PTR(pFrame, pData)            (pFrame)->pFrameData = ((void *) (pData))
+
+#define XAI_FRAME_GET_WIDTH(pFrame)                      ((pFrame)->frameWidth)
+#define XAI_FRAME_SET_WIDTH(pFrame, width)               (pFrame)->frameWidth = ((int32_t) (width))
+
+#define XAI_FRAME_GET_HEIGHT(pFrame)                     ((pFrame)->frameHeight)
+#define XAI_FRAME_SET_HEIGHT(pFrame, height)             (pFrame)->frameHeight = ((int32_t) (height))
+
+#define XAI_FRAME_GET_PITCH(pFrame)                      ((pFrame)->framePitch)
+#define XAI_FRAME_SET_PITCH(pFrame, pitch)               (pFrame)->framePitch = ((int32_t) (pitch))
+
+#define XAI_FRAME_GET_PIXEL_RES(pFrame)                  ((pFrame)->pixelRes)
+#define XAI_FRAME_SET_PIXEL_RES(pFrame, pixRes)          (pFrame)->pixelRes = ((uint8_t) (pixRes))
+
+#define XAI_FRAME_GET_PIXEL_FORMAT(pFrame)               ((pFrame)->pixelPackFormat)
+#define XAI_FRAME_SET_PIXEL_FORMAT(pFrame, pixelFormat)  (pFrame)->pixelPackFormat = ((uint8_t) (pixelFormat))
+
+/*****************************************
+*                   Array Access Macros
+*****************************************/
+#define XAI_ARRAY_GET_BUFF_PTR(pArray)                    ((pArray)->pBuffer)
+#define XAI_ARRAY_SET_BUFF_PTR(pArray, pBuff)             (pArray)->pBuffer = ((void *) (pBuff)) // setters in this group expand to a bare assignment; use them as statements
+
+#define XAI_ARRAY_GET_BUFF_SIZE(pArray)                   ((pArray)->bufferSize)
+#define XAI_ARRAY_SET_BUFF_SIZE(pArray, buffSize)         (pArray)->bufferSize = (buffSize)
+
+#define XAI_ARRAY_GET_DATA_PTR(pArray)                    ((pArray)->pData)
+#define XAI_ARRAY_SET_DATA_PTR(pArray, pArrayData)        (pArray)->pData = ((void *) (pArrayData))
+
+#define XAI_ARRAY_SET_BUFF_PTR_COEFF(pArray, pBuff)       (pArray)->pBuffer = ((uint64_t) (pBuff))      // _COEFF setters store the pointer as an integer, matching the xai_array_coeff_* field types
+#define XAI_ARRAY_SET_DATA_PTR_COEFF(pArray, pArrayData)  (pArray)->pData   = ((uint64_t) (pArrayData))
+
+#define XAI_ARRAY_GET_WIDTH(pArray)                       ((pArray)->width)
+#define XAI_ARRAY_SET_WIDTH(pArray, value)                (pArray)->width = ((int32_t) (value))
+#define XAI_ARRAY_SET_WIDTH_COEFF(pArray, value)          (pArray)->width = ((uint64_t) (value)) // 64-bit variant for the coeff structs, whose width is uint64_t
+
+#define XAI_ARRAY_GET_PITCH(pArray)                       ((pArray)->pitch)
+#define XAI_ARRAY_SET_PITCH(pArray, value)                (pArray)->pitch = ((int32_t) (value))
+
+#define XAI_ARRAY_GET_HEIGHT(pArray)                      ((pArray)->height)
+#define XAI_ARRAY_SET_HEIGHT(pArray, value)               (pArray)->height = ((int32_t) (value)) // cast matches the int32_t height field; the former uint16_t cast silently truncated heights above 65535
+
+#define XAI_ARRAY_GET_STATUS_FLAGS(pArray)                ((pArray)->status)
+#define XAI_ARRAY_SET_STATUS_FLAGS(pArray, value)         (pArray)->status = ((uint32_t) (value)) // cast matches the uint32_t status field; the former uint8_t cast dropped flag bits 8 and above
+
+#define XAI_ARRAY_GET_TYPE(pArray)                        ((pArray)->type)
+#define XAI_ARRAY_SET_TYPE(pArray, value)                 (pArray)->type = ((uint16_t) (value))
+
+#define XAI_ARRAY_GET_CAPACITY(pArray)                    ((pArray)->pitch) // "capacity" is stored in the pitch field
+#define XAI_ARRAY_SET_CAPACITY(pArray, value)             (pArray)->pitch = ((int32_t) (value))
+#define XAI_ARRAY_SET_CAPACITY_COEFF(pArray, value)       (pArray)->pitch = ((int64_t) (value)) // 64-bit variant for the coeff structs, whose pitch is int64_t
+
+#define XAI_ARRAY_GET_ELEMENT_TYPE(pArray)                (XAI_TYPE_ELEMENT_TYPE(XAI_ARRAY_GET_TYPE(pArray)))
+#define XAI_ARRAY_GET_ELEMENT_SIZE(pArray)                (XAI_TYPE_ELEMENT_SIZE(XAI_ARRAY_GET_TYPE(pArray))) // element size in bytes, derived from the type field
+#define XAI_ARRAY_IS_TILE2D(pArray)                       (!(((XAI_ARRAY_GET_TYPE(pArray)) & (XAI_TYPE_TILE_MASK)) ^ XAI_TYPE_TILE2D_BITS)) // 1 only when the kind bits equal XAI_TYPE_TILE2D_BITS
+
+#define XAI_ARRAY_GET_AREA(pArray)                        (((pArray)->width) * ((int32_t) (pArray)->height)) // width * height in the operands' own type (int32_t for xai_array), so very large arrays can overflow
+
+/*****************************************
+*                   Tile Access Macros
+*****************************************/
+#define XAI_TILE2D_GET_BUFF_PTR      XAI_ARRAY_GET_BUFF_PTR // the XAI_TILE2D_* names in this group are plain aliases of the XAI_ARRAY_* accessors (xai_tile2D begins with XAI_ARRAY_FIELDS)
+#define XAI_TILE2D_SET_BUFF_PTR      XAI_ARRAY_SET_BUFF_PTR
+
+#define XAI_TILE2D_GET_BUFF_SIZE     XAI_ARRAY_GET_BUFF_SIZE
+#define XAI_TILE2D_SET_BUFF_SIZE     XAI_ARRAY_SET_BUFF_SIZE
+
+#define XAI_TILE2D_GET_DATA_PTR      XAI_ARRAY_GET_DATA_PTR
+#define XAI_TILE2D_SET_DATA_PTR      XAI_ARRAY_SET_DATA_PTR
+
+#define XAI_TILE2D_GET_WIDTH         XAI_ARRAY_GET_WIDTH
+#define XAI_TILE2D_SET_WIDTH         XAI_ARRAY_SET_WIDTH
+
+#define XAI_TILE2D_GET_PITCH         XAI_ARRAY_GET_PITCH
+#define XAI_TILE2D_SET_PITCH         XAI_ARRAY_SET_PITCH
+
+#define XAI_TILE2D_GET_HEIGHT        XAI_ARRAY_GET_HEIGHT
+#define XAI_TILE2D_SET_HEIGHT        XAI_ARRAY_SET_HEIGHT
+
+#define XAI_TILE2D_GET_STATUS_FLAGS  XAI_ARRAY_GET_STATUS_FLAGS
+#define XAI_TILE2D_SET_STATUS_FLAGS  XAI_ARRAY_SET_STATUS_FLAGS
+
+#define XAI_TILE2D_GET_TYPE          XAI_ARRAY_GET_TYPE
+#define XAI_TILE2D_SET_TYPE          XAI_ARRAY_SET_TYPE
+
+#define XAI_TILE2D_GET_ELEMENT_TYPE  XAI_ARRAY_GET_ELEMENT_TYPE
+#define XAI_TILE2D_GET_ELEMENT_SIZE  XAI_ARRAY_GET_ELEMENT_SIZE
+#define XAI_TILE2D_IS_TILE2D         XAI_ARRAY_IS_TILE2D
+
+#define XAI_TILE2D_GET_FRAME_PTR(pTile)             ((pTile)->pFrame)
+#define XAI_TILE2D_SET_FRAME_PTR(pTile, ptrFrame)   (pTile)->pFrame = ((xai_frame *) (ptrFrame)) // bare assignment; use as a statement
+
+#define XAI_TILE2D_GET_X_COORD(pTile)               ((pTile)->x)
+#define XAI_TILE2D_SET_X_COORD(pTile, xcoord)       (pTile)->x = ((int32_t) (xcoord))
+
+#define XAI_TILE2D_GET_Y_COORD(pTile)               ((pTile)->y)
+#define XAI_TILE2D_SET_Y_COORD(pTile, ycoord)       (pTile)->y = ((int32_t) (ycoord))
+
+#define XAI_TILE2D_GET_EDGE_WIDTH(pTile)            ((pTile)->edgeWidth)
+#define XAI_TILE2D_SET_EDGE_WIDTH(pTile, eWidth)    ((pTile)->edgeWidth = (uint16_t) (eWidth))   // argument parenthesized so the cast covers a whole expression such as (a + b)
+
+#define XAI_TILE2D_GET_EDGE_HEIGHT(pTile)           ((pTile)->edgeHeight)
+#define XAI_TILE2D_SET_EDGE_HEIGHT(pTile, eHeight)  ((pTile)->edgeHeight = (uint16_t) (eHeight)) // argument parenthesized so the cast covers a whole expression such as (a + b)
+
+/***********************************
+*              Other Macros
+***********************************/
+#define XAI_TILE2D_CHECK_VIRTUAL_FRAME(pTile)  ((pTile)->pFrame->pFrameBuff == NULL) // true when the tile's frame has a NULL buffer; dereferences pTile->pFrame, which must be non-NULL
+#define XAI_FRAME_CHECK_VIRTUAL_FRAME(pFrame)  ((pFrame)->pFrameBuff == NULL)        // true when the frame has a NULL buffer
+
+typedef enum { XAI_WHD, XAI_DWH, XAI_ID4WH, XAI_ID16WH, XAI_ID32WH, XAI_WHDN, XAI_NWHD, XAI_NDWH, XAI_DWHN, XAI_IN64DWH, XAI_IN32DWH, XAI_RMOD, XAI_IN16DWH, XAI_MTILE, XAI_CMTILE, XAI_RMOD_DWH_ID16WH, XAI_RMOD_InOutDepth32X, XAI_RMOD_ID4WH, XAI_ID16WHN, XAI_ID32WHN, XAI_IN128DWH, XAI_RMOD_DWH_I16_ID16WH, XAI_RMOD_ID16WH, XAI_RMOD_InOutDepth64X, XAI_UNKNOWN }  xai_cnn_data_order; // data-order identifiers stored in the dataOrder field of xai_frame3D and the 3D tiles; values are implicit (XAI_WHD = 0), so do not reorder
+
+/******************************************************************************************************************
+*
+*                    3D definitions - extension of 2D definitions
+*
+* ****************************************************************************************************************/
+typedef struct xai_frame3DStruct // 3D frame descriptor; xai_pFrame3D is the matching pointer type
+{
+  void               *pFrameBuff;     // frame buffer pointer; NULL is what XAI_FRAME3D_CHECK_VIRTUAL_FRAME tests for
+  uint32_t           frameBuffSize;
+  void               *pFrameData;
+  int32_t            dim1Size;
+  int32_t            dim2Size;
+  int32_t            dim1Pitch;       // pitch in width dimension
+  uint8_t            pixelRes;        // in bits
+  uint8_t            pixelPackFormat; // not used in XI library
+  uint16_t           dim1Edge1;       // dimNEdge1 / dimNEdge2: the two edge sizes along dimension N (which side each one is - TODO confirm)
+  uint16_t           dim1Edge2;
+  uint16_t           dim2Edge1;
+  uint16_t           dim2Edge2;
+  uint16_t           dim3Edge1;
+  uint16_t           dim3Edge2;
+  uint8_t            paddingType;     // accessed through XAI_FRAME3D_GET/SET_PADDING_TYPE
+  // new fields
+  int32_t            dim2Pitch;
+  int32_t            dim3Size;
+  xai_cnn_data_order dataOrder; // WHD, DWH, etc.
+} xai_frame3D, *xai_pFrame3D;
+
+// new access macros: field getters/setters for xai_frame3D; each setter is a parenthesized assignment expression
+#define XAI_FRAME3D_GET_DIM1(x)                 ((x)->dim1Size)
+#define XAI_FRAME3D_SET_DIM1(x, v)              ((x)->dim1Size = (v))
+#define XAI_FRAME3D_GET_DIM1_PITCH(x)           ((x)->dim1Pitch)
+#define XAI_FRAME3D_SET_DIM1_PITCH(x, v)        ((x)->dim1Pitch = (v))
+#define XAI_FRAME3D_GET_DIM1_PITCH_IN_BYTES(x)  ((x)->dim1Pitch * ((x)->pixelRes / 8 + (((x)->pixelRes & 7) != 0))) // dim1Pitch * ceil(pixelRes / 8); the '&' is parenthesized because '!=' binds tighter than '&'
+#define XAI_FRAME3D_GET_DIM2(x)                 ((x)->dim2Size)
+#define XAI_FRAME3D_SET_DIM2(x, v)              ((x)->dim2Size = (v))
+#define XAI_FRAME3D_GET_DIM2_PITCH(x)           ((x)->dim2Pitch)
+#define XAI_FRAME3D_SET_DIM2_PITCH(x, v)        ((x)->dim2Pitch = (v))
+#define XAI_FRAME3D_GET_DIM2_PITCH_IN_BYTES(x)  ((x)->dim2Pitch * ((x)->pixelRes / 8 + (((x)->pixelRes & 7) != 0))) // dim2Pitch * ceil(pixelRes / 8); the '&' is parenthesized because '!=' binds tighter than '&'
+#define XAI_FRAME3D_GET_DIM3(x)                 ((x)->dim3Size)
+#define XAI_FRAME3D_SET_DIM3(x, v)              ((x)->dim3Size = (v))
+#define XAI_FRAME3D_GET_DIM1_EDGE1(x)           ((x)->dim1Edge1)
+#define XAI_FRAME3D_SET_DIM1_EDGE1(x, v)        ((x)->dim1Edge1 = (v))
+#define XAI_FRAME3D_GET_DIM1_EDGE2(x)           ((x)->dim1Edge2)
+#define XAI_FRAME3D_SET_DIM1_EDGE2(x, v)        ((x)->dim1Edge2 = (v))
+#define XAI_FRAME3D_GET_DIM2_EDGE1(x)           ((x)->dim2Edge1)
+#define XAI_FRAME3D_SET_DIM2_EDGE1(x, v)        ((x)->dim2Edge1 = (v))
+#define XAI_FRAME3D_GET_DIM2_EDGE2(x)           ((x)->dim2Edge2)
+#define XAI_FRAME3D_SET_DIM2_EDGE2(x, v)        ((x)->dim2Edge2 = (v))
+#define XAI_FRAME3D_GET_DIM3_EDGE1(x)           ((x)->dim3Edge1)
+#define XAI_FRAME3D_SET_DIM3_EDGE1(x, v)        ((x)->dim3Edge1 = (v))
+#define XAI_FRAME3D_GET_DIM3_EDGE2(x)           ((x)->dim3Edge2)
+#define XAI_FRAME3D_SET_DIM3_EDGE2(x, v)        ((x)->dim3Edge2 = (v))
+#define XAI_FRAME3D_GET_DATA_ORDER(x)           ((x)->dataOrder)
+#define XAI_FRAME3D_SET_DATA_ORDER(x, v)        ((x)->dataOrder = (v))
+
+typedef struct // sizes along three dimensions
+{
+  int32_t dim1Size;
+  int32_t dim2Size;
+  int32_t dim3Size;
+} xai_size3D;
+
+typedef struct // sizes along four dimensions
+{
+  int32_t dim1Size;
+  int32_t dim2Size;
+  int32_t dim3Size;
+  int32_t dim4Size;
+} xai_size4D;
+
+typedef struct // the six edge sizes of a 3D object (two per dimension), same names and types as in xai_frame3D
+{
+  uint16_t dim1Edge1;
+  uint16_t dim1Edge2;
+  uint16_t dim2Edge1;
+  uint16_t dim2Edge2;
+  uint16_t dim3Edge1;
+  uint16_t dim3Edge2;
+} xai_edge3D;
+
+typedef struct // wrapper around a single data-type code
+{
+  int32_t dataType; // presumably one of the XAI_* type codes - TODO confirm
+} xai_dataType;
+
+// 3D tile: XAI_TILE3D_FIELDS holds every member shared by xai_tile3D and xai_tile3D_64 (all but the leading pBuffer/pData)
+#define XAI_TILE3D_FIELDS                                                                  \
+  uint32_t bufferSize;                                                                     \
+  int32_t dim1Size;                                                                        \
+  int32_t dim1Pitch;                                                                       \
+  uint32_t status; /* Currently not used, planned to be obsolete */                        \
+  uint16_t type;                                                                           \
+  int32_t dim2Size;                                                                        \
+  xai_frame3D *pFrame; /* changed to 3D frame */                                           \
+  int32_t dim1Loc;     /* dim1-loc of top-left active pixel in src frame */                \
+  int32_t dim2Loc;     /* dim2-loc of top-left active pixel in src frame */                \
+  uint16_t dim1Edge1;                                                                      \
+  uint16_t dim2Edge1;                                                                      \
+  uint16_t dim1Edge2;                                                                      \
+  uint16_t dim2Edge2;                                                                      \
+  /* new fields */                                                                         \
+  int32_t dim2Pitch;                                                                       \
+  int32_t dim3Size;                                                                        \
+  xai_cnn_data_order dataOrder;                                                            \
+  int32_t dim3Loc; /* dim3-loc of top-left active pixel in src frame */                    \
+  uint16_t dim3Edge1;                                                                      \
+  uint16_t dim3Edge2;                                                                      \
+  /* Number of PTILES in a MEMTILE along a particular dimension. Used for MEMTILES only */ \
+  int16_t numPtilesDim1;                                                                   \
+  int16_t numPtilesDim2;                                                                   \
+  int16_t numPtilesDim3;
+
+typedef struct xai_tile3DStruct // 3D tile with native pointers
+{
+  void *pBuffer; // buffer pointer (XAI_TILE3D_GET/SET_BUFF_PTR)
+  void *pData;   // data pointer (XAI_TILE3D_GET/SET_DATA_PTR); XAI_SETUP_TILE3D places it past the leading edges inside the buffer
+  XAI_TILE3D_FIELDS
+} xai_tile3D, *xai_pTile3D;
+
+typedef struct xai_tile3DStruct_64 // same layout of XAI_TILE3D_FIELDS, but the two pointers are stored as 64-bit integers
+{
+  uint64_t pBuffer;
+  uint64_t pData;
+  XAI_TILE3D_FIELDS
+} xai_tile3D_64, *xai_pTile3D_64;
+
+#define XAI_TILE3D_GET_DIM1(x)           ((x)->dim1Size) // field getters/setters for the 3D tile structs; each setter is a parenthesized assignment expression
+#define XAI_TILE3D_SET_DIM1(x, v)        ((x)->dim1Size = (v))
+#define XAI_TILE3D_GET_DIM1_PITCH(x)     ((x)->dim1Pitch)
+#define XAI_TILE3D_SET_DIM1_PITCH(x, v)  ((x)->dim1Pitch = (v))
+#define XAI_TILE3D_GET_DIM2(x)           ((x)->dim2Size)
+#define XAI_TILE3D_SET_DIM2(x, v)        ((x)->dim2Size = (v))
+#define XAI_TILE3D_GET_DIM2_PITCH(x)     ((x)->dim2Pitch)
+#define XAI_TILE3D_SET_DIM2_PITCH(x, v)  ((x)->dim2Pitch = (v))
+#define XAI_TILE3D_GET_DIM3(x)           ((x)->dim3Size)
+#define XAI_TILE3D_SET_DIM3(x, v)        ((x)->dim3Size = (v))
+#define XAI_TILE3D_GET_DATA_ORDER(x)     ((x)->dataOrder)
+#define XAI_TILE3D_SET_DATA_ORDER(x, v)  ((x)->dataOrder = (v))
+#define XAI_TILE3D_GET_DIM1_COORD(x)     ((x)->dim1Loc) // the *_COORD accessors map to the dimNLoc fields (location of the top-left active pixel in the source frame)
+#define XAI_TILE3D_SET_DIM1_COORD(x, v)  ((x)->dim1Loc = (v))
+#define XAI_TILE3D_GET_DIM2_COORD(x)     ((x)->dim2Loc)
+#define XAI_TILE3D_SET_DIM2_COORD(x, v)  ((x)->dim2Loc = (v))
+#define XAI_TILE3D_GET_DIM3_COORD(x)     ((x)->dim3Loc)
+#define XAI_TILE3D_SET_DIM3_COORD(x, v)  ((x)->dim3Loc = (v))
+#define XAI_TILE3D_GET_DIM1_EDGE1(x)     ((x)->dim1Edge1)
+#define XAI_TILE3D_SET_DIM1_EDGE1(x, v)  ((x)->dim1Edge1 = (v))
+#define XAI_TILE3D_GET_DIM1_EDGE2(x)     ((x)->dim1Edge2)
+#define XAI_TILE3D_SET_DIM1_EDGE2(x, v)  ((x)->dim1Edge2 = (v))
+#define XAI_TILE3D_GET_DIM2_EDGE1(x)     ((x)->dim2Edge1)
+#define XAI_TILE3D_SET_DIM2_EDGE1(x, v)  ((x)->dim2Edge1 = (v))
+#define XAI_TILE3D_GET_DIM2_EDGE2(x)     ((x)->dim2Edge2)
+#define XAI_TILE3D_SET_DIM2_EDGE2(x, v)  ((x)->dim2Edge2 = (v))
+#define XAI_TILE3D_GET_DIM3_EDGE1(x)     ((x)->dim3Edge1)
+#define XAI_TILE3D_SET_DIM3_EDGE1(x, v)  ((x)->dim3Edge1 = (v))
+#define XAI_TILE3D_GET_DIM3_EDGE2(x)     ((x)->dim3Edge2)
+#define XAI_TILE3D_SET_DIM3_EDGE2(x, v)  ((x)->dim3Edge2 = (v))
+
+/*****************************************
+*   Data type definitions
+*****************************************/
+#define XAI_TYPE_IS_TILE3D(type)  (!(((type) & (XAI_TYPE_TILE_MASK)) ^ XAI_TYPE_TILE3D_BITS)) // 1 when the kind bits equal XAI_TYPE_TILE3D_BITS exactly
+
+#define XAI_TILE3D_U4    (XAI_U4 | XAI_TYPE_TILE3D_BITS) // 3D tile types: element type OR'ed with XAI_TYPE_TILE3D_BITS
+#define XAI_TILE3D_U8    (XAI_U8 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_U16   (XAI_U16 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_U32   (XAI_U32 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_U64   (XAI_U64 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_U128  (XAI_U128 | XAI_TYPE_TILE3D_BITS)
+
+#define XAI_TILE3D_S4    (XAI_S4 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_S8    (XAI_S8 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_S16   (XAI_S16 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_S32   (XAI_S32 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_S64   (XAI_S64 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_S128  (XAI_S128 | XAI_TYPE_TILE3D_BITS)
+
+#define XAI_TILE3D_F8    (XAI_F8 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_F16   (XAI_F16 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_F32   (XAI_F32 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_F64   (XAI_F64 | XAI_TYPE_TILE3D_BITS)
+#define XAI_TILE3D_F128  (XAI_F128 | XAI_TYPE_TILE3D_BITS)
+
+/*****************************************
+*                   3D Frame Access Macros
+*****************************************/
+#define XAI_FRAME3D_GET_BUFF_PTR      XAI_FRAME_GET_BUFF_PTR // aliases of the 2D frame accessors; valid because xai_frame3D uses the same member names for these fields
+#define XAI_FRAME3D_SET_BUFF_PTR      XAI_FRAME_SET_BUFF_PTR
+
+#define XAI_FRAME3D_GET_BUFF_SIZE     XAI_FRAME_GET_BUFF_SIZE
+#define XAI_FRAME3D_SET_BUFF_SIZE     XAI_FRAME_SET_BUFF_SIZE
+
+#define XAI_FRAME3D_GET_DATA_PTR      XAI_FRAME_GET_DATA_PTR
+#define XAI_FRAME3D_SET_DATA_PTR      XAI_FRAME_SET_DATA_PTR
+
+#define XAI_FRAME3D_GET_PIXEL_RES     XAI_FRAME_GET_PIXEL_RES
+#define XAI_FRAME3D_SET_PIXEL_RES     XAI_FRAME_SET_PIXEL_RES
+
+#define XAI_FRAME3D_GET_PIXEL_FORMAT  XAI_FRAME_GET_PIXEL_FORMAT
+#define XAI_FRAME3D_SET_PIXEL_FORMAT  XAI_FRAME_SET_PIXEL_FORMAT
+
+#define XAI_FRAME3D_GET_PADDING_TYPE(pFrame)           ((pFrame)->paddingType)                        // defined directly on xai_frame3D::paddingType: the 2D frame accessor section defines no XAI_FRAME_GET_PADDING_TYPE to alias
+#define XAI_FRAME3D_SET_PADDING_TYPE(pFrame, padType)  ((pFrame)->paddingType = ((uint8_t) (padType))) // cast matches the uint8_t paddingType field
+
+/*****************************************
+*                   3D Tile Access Macros
+*****************************************/
+#define XAI_TILE3D_GET_BUFF_PTR        XAI_TILE2D_GET_BUFF_PTR // aliases of the 2D tile accessors, which in turn alias the XAI_ARRAY_* ones (pBuffer/pData/bufferSize/status/type exist in the 3D tile structs too)
+#define XAI_TILE3D_SET_BUFF_PTR        XAI_TILE2D_SET_BUFF_PTR
+#define XAI_TILE3D_SET_BUFF_PTR_COEFF  XAI_TILE2D_SET_BUFF_PTR_COEFF // NOTE(review): no XAI_TILE2D_SET_BUFF_PTR_COEFF is defined in the 2D tile accessor section (only XAI_ARRAY_SET_BUFF_PTR_COEFF) - confirm it is defined elsewhere
+
+#define XAI_TILE3D_GET_BUFF_SIZE       XAI_TILE2D_GET_BUFF_SIZE
+#define XAI_TILE3D_SET_BUFF_SIZE       XAI_TILE2D_SET_BUFF_SIZE
+
+#define XAI_TILE3D_GET_DATA_PTR        XAI_TILE2D_GET_DATA_PTR
+#define XAI_TILE3D_SET_DATA_PTR        XAI_TILE2D_SET_DATA_PTR
+#define XAI_TILE3D_SET_DATA_PTR_COEFF  XAI_TILE2D_SET_DATA_PTR_COEFF // NOTE(review): no XAI_TILE2D_SET_DATA_PTR_COEFF is defined in the 2D tile accessor section (only XAI_ARRAY_SET_DATA_PTR_COEFF) - confirm it is defined elsewhere
+
+#define XAI_TILE3D_GET_STATUS_FLAGS    XAI_TILE2D_GET_STATUS_FLAGS
+#define XAI_TILE3D_SET_STATUS_FLAGS    XAI_TILE2D_SET_STATUS_FLAGS
+
+#define XAI_TILE3D_GET_TYPE            XAI_TILE2D_GET_TYPE
+#define XAI_TILE3D_SET_TYPE            XAI_TILE2D_SET_TYPE
+
+#define XAI_TILE3D_GET_ELEMENT_TYPE    XAI_TILE2D_GET_ELEMENT_TYPE
+#define XAI_TILE3D_GET_ELEMENT_SIZE    XAI_TILE2D_GET_ELEMENT_SIZE
+#define XAI_TILE3D_IS_TILE             XAI_TILE2D_IS_TILE2D // NOTE(review): expands to the exact XAI_TYPE_TILE2D_BITS comparison, which is false for XAI_TYPE_TILE3D_BITS types - confirm intent
+
+#define XAI_TILE3D_GET_FRAME_PTR(pTile3D)            ((pTile3D)->pFrame)
+#define XAI_TILE3D_SET_FRAME_PTR(pTile3D, ptrFrame)  (pTile3D)->pFrame = ((xai_pFrame3D) (ptrFrame)) // bare assignment; use as a statement
+
+#define XAI_TILE3D_CHECK_STATUS_FLAGS_DMA_ONGOING  XAI_TILE2D_CHECK_STATUS_FLAGS_DMA_ONGOING // NOTE(review): the 2D target is not defined in the 2D tile accessor section - confirm it is defined elsewhere
+
+/***********************************
+*              Other Macros
+***********************************/
+#define XAI_TILE3D_CHECK_VIRTUAL_FRAME   XAI_TILE2D_CHECK_VIRTUAL_FRAME // alias: true when the tile's frame has a NULL pFrameBuff
+#define XAI_FRAME3D_CHECK_VIRTUAL_FRAME  XAI_FRAME_CHECK_VIRTUAL_FRAME  // alias: true when the frame has a NULL pFrameBuff
+
+typedef enum // buffer alignment modes interpreted by XAI_SETUP_TILE3D
+{
+  XAI_TILE_UNALIGNED,   // no pointer rounding
+  XAI_EDGE_ALIGNED_32,  // buffer start (edge) pointer rounded up to 32 bytes
+  XAI_DATA_ALIGNED_32,  // data pointer rounded up to 32 bytes
+  XAI_EDGE_ALIGNED_64,  // buffer start (edge) pointer rounded up to 64 bytes
+  XAI_DATA_ALIGNED_64,  // data pointer rounded up to 64 bytes
+  EDGE_ALIGNED_128,     // buffer start (edge) pointer rounded up to 128 bytes; note: no XAI_ prefix, unlike the other enumerators
+  DATA_ALIGNED_128,     // data pointer rounded up to 128 bytes; note: no XAI_ prefix, unlike the other enumerators
+} xai_buffer_align_type_t;
+
+// Fills in every field of a 3D tile. Only Q8, 240 and 341 use alignment = 127 (the default mask below); for P6, P1 and Q7-like DSPs alignment = 127 is not supported.
+#define XAI_SETUP_TILE3D(type, pTile, pBuf, pFrame, bufSize, dim1Size, dim2Size, dim3Size, dim1Pitch, dim2Pitch,                 \
+                         dim1Edge1, dim1Edge2, dim2Edge1, dim2Edge2, dim3Edge1, dim3Edge2, dim1Loc, dim2Loc, dim3Loc, dataOrder, \
+                         alignType)                                                                                              \
+  {                                                                                                                              \
+    XAI_TILE3D_SET_TYPE(pTile, type); /* set first: the data offset below is scaled by the element size read from type */        \
+    XAI_TILE3D_SET_FRAME_PTR(pTile, pFrame);                                                                                     \
+    XAI_TILE3D_SET_BUFF_PTR(pTile, pBuf);                                                                                        \
+    XAI_TILE3D_SET_BUFF_SIZE(pTile, bufSize);                                                                                    \
+    XAI_TILE3D_SET_DIM1(pTile, dim1Size);                                                                                        \
+    XAI_TILE3D_SET_DIM2(pTile, dim2Size);                                                                                        \
+    XAI_TILE3D_SET_DIM3(pTile, dim3Size);                                                                                        \
+    XAI_TILE3D_SET_DIM1_PITCH(pTile, dim1Pitch);                                                                                 \
+    XAI_TILE3D_SET_DIM2_PITCH(pTile, dim2Pitch);                                                                                 \
+    uint8_t *edgePtr  = (uint8_t *) (pBuf), *dataPtr; /* pBuf parenthesized so the cast covers the whole argument */             \
+    int32_t alignment = 127; /* alignment mask (bytes - 1); narrowed below for the 64- and 32-byte modes */                      \
+    if (((alignType) == XAI_EDGE_ALIGNED_64) || ((alignType) == XAI_DATA_ALIGNED_64)) { alignment = 63; }                        \
+    if (((alignType) == XAI_EDGE_ALIGNED_32) || ((alignType) == XAI_DATA_ALIGNED_32)) { alignment = 31; }                        \
+    if (((alignType) == XAI_EDGE_ALIGNED_32) || ((alignType) == XAI_EDGE_ALIGNED_64) || ((alignType) == EDGE_ALIGNED_128))       \
+    { /* edge-aligned modes: round the start of the buffer up before adding the edge offset */                                   \
+      edgePtr = (uint8_t *) (((uintptr_t) (pBuf) + alignment) & (~alignment));                                                   \
+    }                                                                                                                            \
+    XAI_TILE3D_SET_DATA_PTR(pTile, edgePtr + ((dim3Edge1) * (dim2Pitch) +                                                        \
+                                              (dim2Edge1) * (dim1Pitch) + (dim1Edge1)) * XAI_TILE3D_GET_ELEMENT_SIZE(pTile));    \
+    if (((alignType) == XAI_DATA_ALIGNED_32) || ((alignType) == XAI_DATA_ALIGNED_64) || ((alignType) == DATA_ALIGNED_128))       \
+    { /* data-aligned modes: round the computed data pointer itself up */                                                        \
+      dataPtr = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(pTile);                                                                      \
+      dataPtr = (uint8_t *) (((uintptr_t) (dataPtr) + alignment) & (~alignment));                                                \
+      XAI_TILE3D_SET_DATA_PTR(pTile, dataPtr);                                                                                   \
+    }                                                                                                                            \
+    XAI_TILE3D_SET_DIM1_EDGE1(pTile, dim1Edge1);                                                                                 \
+    XAI_TILE3D_SET_DIM1_EDGE2(pTile, dim1Edge2);                                                                                 \
+    XAI_TILE3D_SET_DIM2_EDGE1(pTile, dim2Edge1);                                                                                 \
+    XAI_TILE3D_SET_DIM2_EDGE2(pTile, dim2Edge2);                                                                                 \
+    XAI_TILE3D_SET_DIM3_EDGE1(pTile, dim3Edge1);                                                                                 \
+    XAI_TILE3D_SET_DIM3_EDGE2(pTile, dim3Edge2);                                                                                 \
+    XAI_TILE3D_SET_DIM1_COORD(pTile, dim1Loc);                                                                                   \
+    XAI_TILE3D_SET_DIM2_COORD(pTile, dim2Loc);                                                                                   \
+    XAI_TILE3D_SET_DIM3_COORD(pTile, dim3Loc);                                                                                   \
+    XAI_TILE3D_SET_DATA_ORDER(pTile, dataOrder);                                                                                 \
+  }
+
+#define XAI_SETUP_FRAME3D(pFrame, pFrameBuffer, bufSize, dim1Size, dim2Size, dim3Size, dim1Pitch, dim2Pitch,                    \
+                          dim1Edge1, dim1Edge2, dim2Edge1, dim2Edge2, dim3Edge1, dim3Edge2, pixRes, pixPackFormat, paddingType, \
+                          dataOrder)                                                                                            \
+  { /* Initializes the 3D frame descriptor pFrame from the arguments. Expands to a brace block, so use it as a statement. */    \
+    XAI_FRAME3D_SET_BUFF_PTR(pFrame, pFrameBuffer);                                                                             \
+    XAI_FRAME3D_SET_BUFF_SIZE(pFrame, bufSize);                                                                                 \
+    XAI_FRAME3D_SET_DIM1(pFrame, dim1Size);                                                                                     \
+    XAI_FRAME3D_SET_DIM2(pFrame, dim2Size);                                                                                     \
+    XAI_FRAME3D_SET_DIM3(pFrame, dim3Size);                                                                                     \
+    XAI_FRAME3D_SET_DIM1_PITCH(pFrame, dim1Pitch);                                                                              \
+    XAI_FRAME3D_SET_DIM2_PITCH(pFrame, dim2Pitch);                                                                              \
+    XAI_FRAME3D_SET_DATA_PTR(pFrame, pFrameBuffer + ((dim3Edge1) * (dim2Pitch) + /* data ptr = buffer start + leading edges: */ \
+                                                     (dim2Edge1) * (dim1Pitch) + (dim1Edge1)) * pixRes); /* dim3Edge1 planes + dim2Edge1 rows + dim1Edge1 elements. NOTE(review): the offset is scaled by pixRes as passed and pFrameBuffer/pixRes are not parenthesized - confirm units and pass simple expressions */ \
+    XAI_FRAME3D_SET_DIM1_EDGE1(pFrame, dim1Edge1);                                                                              \
+    XAI_FRAME3D_SET_DIM1_EDGE2(pFrame, dim1Edge2);                                                                              \
+    XAI_FRAME3D_SET_DIM2_EDGE1(pFrame, dim2Edge1);                                                                              \
+    XAI_FRAME3D_SET_DIM2_EDGE2(pFrame, dim2Edge2);                                                                              \
+    XAI_FRAME3D_SET_DIM3_EDGE1(pFrame, dim3Edge1);                                                                              \
+    XAI_FRAME3D_SET_DIM3_EDGE2(pFrame, dim3Edge2);                                                                              \
+    XAI_FRAME3D_SET_PIXEL_RES(pFrame, pixRes);                                                                                  \
+    XAI_FRAME3D_SET_PIXEL_FORMAT(pFrame, pixPackFormat);                                                                        \
+    XAI_FRAME3D_SET_PADDING_TYPE(pFrame, paddingType);                                                                          \
+    XAI_FRAME3D_SET_DATA_ORDER(pFrame, dataOrder);                                                                              \
+  }
+
+#define XAI_COPY_FRAME3D_TO_TILE3D(frame, tile)         { /* Copies the three sizes, the dim1/dim2 pitches, all six edges, the data pointer and the data order from frame to tile; no other tile setter is invoked. */ \
+    XAI_TILE3D_SET_DIM1(tile, XAI_FRAME3D_GET_DIM1(frame));             \
+    XAI_TILE3D_SET_DIM1_PITCH(tile, XAI_FRAME3D_GET_DIM1_PITCH(frame)); \
+    XAI_TILE3D_SET_DIM1_EDGE1(tile, XAI_FRAME3D_GET_DIM1_EDGE1(frame)); \
+    XAI_TILE3D_SET_DIM1_EDGE2(tile, XAI_FRAME3D_GET_DIM1_EDGE2(frame)); \
+    XAI_TILE3D_SET_DIM2(tile, XAI_FRAME3D_GET_DIM2(frame));             \
+    XAI_TILE3D_SET_DIM2_PITCH(tile, XAI_FRAME3D_GET_DIM2_PITCH(frame)); \
+    XAI_TILE3D_SET_DIM2_EDGE1(tile, XAI_FRAME3D_GET_DIM2_EDGE1(frame)); \
+    XAI_TILE3D_SET_DIM2_EDGE2(tile, XAI_FRAME3D_GET_DIM2_EDGE2(frame)); \
+    XAI_TILE3D_SET_DIM3(tile, XAI_FRAME3D_GET_DIM3(frame));             \
+    XAI_TILE3D_SET_DIM3_EDGE1(tile, XAI_FRAME3D_GET_DIM3_EDGE1(frame)); \
+    XAI_TILE3D_SET_DIM3_EDGE2(tile, XAI_FRAME3D_GET_DIM3_EDGE2(frame)); \
+    XAI_TILE3D_SET_DATA_PTR(tile, XAI_FRAME3D_GET_DATA_PTR(frame));     \
+    XAI_TILE3D_SET_DATA_ORDER(tile, XAI_FRAME3D_GET_DATA_ORDER(frame)); \
+}
+
+#define XAI_COPY_FRAME3D_TO_FRAME3D(frameIn, frameOut)  { /* Copies the three sizes, the dim1/dim2 pitches, all six edges, the data pointer, the data order and the pixel resolution from frameIn to frameOut. */ \
+    XAI_FRAME3D_SET_DIM1(frameOut, XAI_FRAME3D_GET_DIM1(frameIn));             \
+    XAI_FRAME3D_SET_DIM1_PITCH(frameOut, XAI_FRAME3D_GET_DIM1_PITCH(frameIn)); \
+    XAI_FRAME3D_SET_DIM1_EDGE1(frameOut, XAI_FRAME3D_GET_DIM1_EDGE1(frameIn)); \
+    XAI_FRAME3D_SET_DIM1_EDGE2(frameOut, XAI_FRAME3D_GET_DIM1_EDGE2(frameIn)); \
+    XAI_FRAME3D_SET_DIM2(frameOut, XAI_FRAME3D_GET_DIM2(frameIn));             \
+    XAI_FRAME3D_SET_DIM2_PITCH(frameOut, XAI_FRAME3D_GET_DIM2_PITCH(frameIn)); \
+    XAI_FRAME3D_SET_DIM2_EDGE1(frameOut, XAI_FRAME3D_GET_DIM2_EDGE1(frameIn)); \
+    XAI_FRAME3D_SET_DIM2_EDGE2(frameOut, XAI_FRAME3D_GET_DIM2_EDGE2(frameIn)); \
+    XAI_FRAME3D_SET_DIM3(frameOut, XAI_FRAME3D_GET_DIM3(frameIn));             \
+    XAI_FRAME3D_SET_DIM3_EDGE1(frameOut, XAI_FRAME3D_GET_DIM3_EDGE1(frameIn)); /* dim3 edges come from the dim3 getters (the dim2 getters were used here before) */ \
+    XAI_FRAME3D_SET_DIM3_EDGE2(frameOut, XAI_FRAME3D_GET_DIM3_EDGE2(frameIn)); \
+    XAI_FRAME3D_SET_DATA_PTR(frameOut, XAI_FRAME3D_GET_DATA_PTR(frameIn));     \
+    XAI_FRAME3D_SET_DATA_ORDER(frameOut, XAI_FRAME3D_GET_DATA_ORDER(frameIn)); \
+    XAI_FRAME3D_SET_PIXEL_RES(frameOut, XAI_FRAME3D_GET_PIXEL_RES(frameIn));   \
+}
+
+#define XAI_COPY_TILE3D_TO_TILE3D(tileIn, tileOut)      { /* Copies the three sizes, the dim1/dim2 pitches, all six edges, the data pointer and the data order from tileIn to tileOut; no other tile setter is invoked. */ \
+    XAI_TILE3D_SET_DIM1(tileOut, XAI_TILE3D_GET_DIM1(tileIn));             \
+    XAI_TILE3D_SET_DIM1_PITCH(tileOut, XAI_TILE3D_GET_DIM1_PITCH(tileIn)); \
+    XAI_TILE3D_SET_DIM1_EDGE1(tileOut, XAI_TILE3D_GET_DIM1_EDGE1(tileIn)); \
+    XAI_TILE3D_SET_DIM1_EDGE2(tileOut, XAI_TILE3D_GET_DIM1_EDGE2(tileIn)); \
+    XAI_TILE3D_SET_DIM2(tileOut, XAI_TILE3D_GET_DIM2(tileIn));             \
+    XAI_TILE3D_SET_DIM2_PITCH(tileOut, XAI_TILE3D_GET_DIM2_PITCH(tileIn)); \
+    XAI_TILE3D_SET_DIM2_EDGE1(tileOut, XAI_TILE3D_GET_DIM2_EDGE1(tileIn)); \
+    XAI_TILE3D_SET_DIM2_EDGE2(tileOut, XAI_TILE3D_GET_DIM2_EDGE2(tileIn)); \
+    XAI_TILE3D_SET_DIM3(tileOut, XAI_TILE3D_GET_DIM3(tileIn));             \
+    XAI_TILE3D_SET_DIM3_EDGE1(tileOut, XAI_TILE3D_GET_DIM3_EDGE1(tileIn)); \
+    XAI_TILE3D_SET_DIM3_EDGE2(tileOut, XAI_TILE3D_GET_DIM3_EDGE2(tileIn)); \
+    XAI_TILE3D_SET_DATA_PTR(tileOut, XAI_TILE3D_GET_DATA_PTR(tileIn));     \
+    XAI_TILE3D_SET_DATA_ORDER(tileOut, XAI_TILE3D_GET_DATA_ORDER(tileIn)); \
+}
+
+// Sets both dim1 edges of pTile to newEdgeSize: moves the data pointer by (newEdgeSize - current dim1Edge1) bytes and sets dim1 to dim1Pitch - 2 * newEdgeSize. Assumes 8 bit pixRes and Edge1 = Edge2
+#define XAI_TILE3D_UPDATE_EDGE_DIM1(pTile, newEdgeSize)                  \
+  {                                                                      \
+    uint16_t currEdgeSize = (uint16_t) XAI_TILE3D_GET_DIM1_EDGE1(pTile); \
+    uint32_t dim1Pitch    = (uint32_t) XAI_TILE3D_GET_DIM1_PITCH(pTile); \
+    uintptr_t dataU32     = (uintptr_t) XAI_TILE3D_GET_DATA_PTR(pTile);  /* data pointer as an integer (uintptr_t, despite the name) */ \
+    dataU32 = dataU32 + (newEdgeSize) - currEdgeSize;                    \
+    XAI_TILE3D_SET_DATA_PTR(pTile, (void *) dataU32);                    \
+    XAI_TILE3D_SET_DIM1_EDGE1(pTile, newEdgeSize);                       \
+    XAI_TILE3D_SET_DIM1_EDGE2(pTile, newEdgeSize);                       \
+    XAI_TILE3D_SET_DIM1(pTile, dim1Pitch - 2 * (newEdgeSize));           /* argument parenthesized so an expression such as a + b is scaled as a whole */ \
+  }
+
+// Sets both dim2 edges of pTile to newEdgeSize: moves the data pointer by dim1Pitch * (newEdgeSize - current dim2Edge1) bytes and changes dim2 by 2 * (current edge - newEdgeSize). Assumes 8 bit pixRes and Edge1 = Edge2
+#define XAI_TILE3D_UPDATE_EDGE_DIM2(pTile, newEdgeSize)                      \
+  {                                                                          \
+    uint16_t currEdgeSize = (uint16_t) XAI_TILE3D_GET_DIM2_EDGE1(pTile);     \
+    uint32_t dim1Pitch    = (uint32_t) XAI_TILE3D_GET_DIM1_PITCH(pTile);     \
+    uint16_t dim2Size     = (uint16_t) XAI_TILE3D_GET_DIM2(pTile);           \
+    uintptr_t dataU32     = (uintptr_t) XAI_TILE3D_GET_DATA_PTR(pTile);      /* data pointer as an integer (uintptr_t, despite the name) */ \
+    dataU32 = dataU32 + (uintptr_t) dim1Pitch * ((uintptr_t) (newEdgeSize) - (uintptr_t) currEdgeSize); /* done in pointer width: a shrinking edge must wrap modulo uintptr_t, not modulo 2^32 */ \
+    XAI_TILE3D_SET_DATA_PTR(pTile, (void *) dataU32);                        \
+    XAI_TILE3D_SET_DIM2_EDGE1(pTile, newEdgeSize);                           \
+    XAI_TILE3D_SET_DIM2_EDGE2(pTile, newEdgeSize);                           \
+    XAI_TILE3D_SET_DIM2(pTile, dim2Size + 2 * (currEdgeSize - (newEdgeSize))); \
+  }
+
+// Sets both dim3 edges of pTile to newEdgeSize: moves the data pointer by dim2Pitch * (newEdgeSize - current dim3Edge1) bytes and changes dim3 by 2 * (current edge - newEdgeSize). Assumes 8 bit pixRes and Edge1 = Edge2
+#define XAI_TILE3D_UPDATE_EDGE_DIM3(pTile, newEdgeSize)                      \
+  {                                                                          \
+    uint16_t currEdgeSize = (uint16_t) XAI_TILE3D_GET_DIM3_EDGE1(pTile);     \
+    uint32_t dim2Pitch    = (uint32_t) XAI_TILE3D_GET_DIM2_PITCH(pTile);     \
+    uint16_t dim3Size     = (uint16_t) XAI_TILE3D_GET_DIM3(pTile);           \
+    uintptr_t dataU32     = (uintptr_t) XAI_TILE3D_GET_DATA_PTR(pTile);      /* data pointer as an integer (uintptr_t, despite the name) */ \
+    dataU32 = dataU32 + (uintptr_t) dim2Pitch * ((uintptr_t) (newEdgeSize) - (uintptr_t) currEdgeSize); /* done in pointer width: a shrinking edge must wrap modulo uintptr_t, not modulo 2^32 */ \
+    XAI_TILE3D_SET_DATA_PTR(pTile, (void *) dataU32);                        \
+    XAI_TILE3D_SET_DIM3_EDGE1(pTile, newEdgeSize);                           \
+    XAI_TILE3D_SET_DIM3_EDGE2(pTile, newEdgeSize);                           \
+    XAI_TILE3D_SET_DIM3(pTile, dim3Size + 2 * (currEdgeSize - (newEdgeSize))); \
+  }
+
+#define XAI_TILE3D_UPDATE_DIMENSIONS(pTile, dim1Loc, dim2Loc, dim3Loc, dim1Size, dim2Size, dim3Size, \
+                                     dim1Pitch, dim2Pitch)                                           \
+  { /* Overwrites the tile's three coordinates, three sizes and dim1/dim2 pitches; no other setter is invoked. */ \
+    XAI_TILE3D_SET_DIM1_COORD(pTile, dim1Loc);                                                       \
+    XAI_TILE3D_SET_DIM2_COORD(pTile, dim2Loc);                                                       \
+    XAI_TILE3D_SET_DIM3_COORD(pTile, dim3Loc);                                                       \
+    XAI_TILE3D_SET_DIM1(pTile, dim1Size);                                                            \
+    XAI_TILE3D_SET_DIM2(pTile, dim2Size);                                                            \
+    XAI_TILE3D_SET_DIM3(pTile, dim3Size);                                                            \
+    XAI_TILE3D_SET_DIM1_PITCH(pTile, dim1Pitch);                                                     \
+    XAI_TILE3D_SET_DIM2_PITCH(pTile, dim2Pitch);                                                     \
+  }
+
+/******************************************************************************************************************
+*
+*                    4D definitions - extension of 3D definitions
+*
+* ****************************************************************************************************************/
+typedef struct xai_frame4DStruct
+{ // 4D frame descriptor; read and written through the XAI_FRAME4D_* accessor macros
+  void               *pFrameBuff;
+  uint32_t           frameBuffSize;
+  void               *pFrameData;
+  int32_t            dim1Size;
+  int32_t            dim2Size;
+  int32_t            dim1Pitch; // pitch in width dimension
+  uint8_t            pixelRes;  // in bits
+  uint8_t            pixelPackFormat;
+  uint16_t           dim1Edge1;
+  uint16_t           dim1Edge2;
+  uint16_t           dim2Edge1;
+  uint16_t           dim2Edge2;
+  uint16_t           dim3Edge1; // accessed directly by XAI_FRAME4D_{GET,SET}_DIM3_EDGE1
+  uint16_t           dim3Edge2; // accessed directly by XAI_FRAME4D_{GET,SET}_DIM3_EDGE2
+  uint8_t            paddingType;
+  // new fields (NOTE(review): presumably the members added for the 3D extension - confirm)
+  int32_t            dim2Pitch;
+  int32_t            dim3Size;
+  xai_cnn_data_order dataOrder; // WHD, DWH, WHDN, NWHD, etc.
+  // new fields for the 4th dimension
+  int32_t            dim3Pitch; // accessed directly by XAI_FRAME4D_{GET,SET}_DIM3_PITCH
+  int32_t            dim4Size;  // accessed directly by XAI_FRAME4D_{GET,SET}_DIM4
+} xai_frame4D, *xai_pFrame4D;
+
+// 4D frame access macros: dim1-dim3 sizes, dim1/dim2 pitches, dim1/dim2 edges and data order alias the XAI_FRAME3D_* accessors; DIM4, DIM3_PITCH and DIM3_EDGE1/2 access the struct members directly
+#define XAI_FRAME4D_GET_DIM1                 XAI_FRAME3D_GET_DIM1
+#define XAI_FRAME4D_SET_DIM1                 XAI_FRAME3D_SET_DIM1
+#define XAI_FRAME4D_GET_DIM1_PITCH           XAI_FRAME3D_GET_DIM1_PITCH
+#define XAI_FRAME4D_SET_DIM1_PITCH           XAI_FRAME3D_SET_DIM1_PITCH
+#define XAI_FRAME4D_GET_DIM1_PITCH_IN_BYTES  XAI_FRAME3D_GET_DIM1_PITCH_IN_BYTES
+#define XAI_FRAME4D_GET_DIM2                 XAI_FRAME3D_GET_DIM2
+#define XAI_FRAME4D_SET_DIM2                 XAI_FRAME3D_SET_DIM2
+#define XAI_FRAME4D_GET_DIM2_PITCH           XAI_FRAME3D_GET_DIM2_PITCH
+#define XAI_FRAME4D_SET_DIM2_PITCH           XAI_FRAME3D_SET_DIM2_PITCH
+#define XAI_FRAME4D_GET_DIM2_PITCH_IN_BYTES  XAI_FRAME3D_GET_DIM2_PITCH_IN_BYTES
+#define XAI_FRAME4D_GET_DIM3                 XAI_FRAME3D_GET_DIM3
+#define XAI_FRAME4D_SET_DIM3                 XAI_FRAME3D_SET_DIM3
+#define XAI_FRAME4D_GET_DATA_ORDER           XAI_FRAME3D_GET_DATA_ORDER
+#define XAI_FRAME4D_SET_DATA_ORDER           XAI_FRAME3D_SET_DATA_ORDER
+#define XAI_FRAME4D_GET_DIM1_EDGE1           XAI_FRAME3D_GET_DIM1_EDGE1
+#define XAI_FRAME4D_SET_DIM1_EDGE1           XAI_FRAME3D_SET_DIM1_EDGE1
+#define XAI_FRAME4D_GET_DIM1_EDGE2           XAI_FRAME3D_GET_DIM1_EDGE2
+#define XAI_FRAME4D_SET_DIM1_EDGE2           XAI_FRAME3D_SET_DIM1_EDGE2
+#define XAI_FRAME4D_GET_DIM2_EDGE1           XAI_FRAME3D_GET_DIM2_EDGE1
+#define XAI_FRAME4D_SET_DIM2_EDGE1           XAI_FRAME3D_SET_DIM2_EDGE1
+#define XAI_FRAME4D_GET_DIM2_EDGE2           XAI_FRAME3D_GET_DIM2_EDGE2
+#define XAI_FRAME4D_SET_DIM2_EDGE2           XAI_FRAME3D_SET_DIM2_EDGE2
+#define XAI_FRAME4D_GET_DIM4(x)           ((x)->dim4Size)
+#define XAI_FRAME4D_SET_DIM4(x, v)        ((x)->dim4Size = (v))
+#define XAI_FRAME4D_GET_DIM3_PITCH(x)     ((x)->dim3Pitch)
+#define XAI_FRAME4D_SET_DIM3_PITCH(x, v)  ((x)->dim3Pitch = (v))
+#define XAI_FRAME4D_GET_DIM3_EDGE1(x)     ((x)->dim3Edge1)
+#define XAI_FRAME4D_SET_DIM3_EDGE1(x, v)  ((x)->dim3Edge1 = (v))
+#define XAI_FRAME4D_GET_DIM3_EDGE2(x)     ((x)->dim3Edge2)
+#define XAI_FRAME4D_SET_DIM3_EDGE2(x, v)  ((x)->dim3Edge2 = (v))
+
+// 4D tile: member list shared by xai_tile4D and xai_tile4D_64 (everything after their leading pBuffer/pData pair)
+#define XAI_TILE4D_FIELDS                                                                  \
+  uint32_t bufferSize;                                                                     \
+  int32_t dim1Size;                                                                        \
+  int32_t dim1Pitch;                                                                       \
+  uint32_t status; /*Currently not used, planned to be obsolete*/                          \
+  uint16_t type;                                                                           \
+  int32_t dim2Size;                                                                        \
+  xai_frame4D *pFrame;                                                                     \
+  int32_t dim1Loc;  /* dim1-loc of top-left active pixel in src frame */                   \
+  int32_t dim2Loc;  /* dim2-loc of top-left active pixel in src frame */                   \
+  uint16_t dim1Edge1;                                                                      \
+  uint16_t dim2Edge1;                                                                      \
+  uint16_t dim1Edge2;                                                                      \
+  uint16_t dim2Edge2;                                                                      \
+  /* new fields (NOTE(review): presumably the members added for the 3D extension) */       \
+  int32_t dim2Pitch;                                                                       \
+  int32_t dim3Size;                                                                        \
+  xai_cnn_data_order dataOrder;                                                            \
+  int32_t dim3Loc;  /* dim3-loc of top-left active pixel in src frame */                   \
+  uint16_t dim3Edge1;                                                                      \
+  uint16_t dim3Edge2;                                                                      \
+  /* new fields for the 4th dimension */                                                   \
+  int32_t dim3Pitch;                                                                       \
+  int32_t dim4Size;  /* 4th dimension is num for lack of better term */                    \
+  int32_t dim4Loc;   /* dim4-loc of top-left active pixel in src frame */                  \
+  /* Number of PTILES in a MEMTILE along a particular dimension. Used for MEMTILES only */ \
+  int16_t numPtilesDim1;                                                                   \
+  int16_t numPtilesDim2;                                                                   \
+  int16_t numPtilesDim3;
+
+typedef struct xai_tile4DStruct
+{ // 4D tile descriptor whose buffer and data addresses are stored as native pointers
+  void *pBuffer;
+  void *pData;
+  XAI_TILE4D_FIELDS // remaining members, shared with xai_tile4D_64
+#ifdef GLOW_BUILD
+  int8_t printFlag;     // accessed by XAI_TILE4D_{GET,SET}_PRINT_FLAG
+  const char *nodeName; // accessed by XAI_TILE4D_{GET,SET}_NODE_NAME
+  const char *fileName; // accessed by XAI_TILE4D_{GET,SET}_FILE_NAME
+#endif // GLOW_BUILD
+} xai_tile4D, *xai_pTile4D;
+
+typedef struct xai_tile4DStruct_64
+{ // Same member list as xai_tile4D, except that pBuffer/pData are stored as uint64_t instead of void *
+  uint64_t pBuffer;
+  uint64_t pData;
+  XAI_TILE4D_FIELDS // remaining members, shared with xai_tile4D
+#ifdef GLOW_BUILD
+  int8_t printFlag;
+  const char *nodeName; // NOTE(review): still a native pointer, unlike pBuffer/pData - confirm this is intended
+  const char *fileName;
+#endif // GLOW_BUILD
+} xai_tile4D_64, *xai_pTile4D_64;
+
+#define XAI_TILE4D_GET_DIM1        XAI_TILE3D_GET_DIM1 // dim1-dim3 sizes, pitches, edges, coords and data order alias the XAI_TILE3D_* accessors
+#define XAI_TILE4D_SET_DIM1        XAI_TILE3D_SET_DIM1
+#define XAI_TILE4D_GET_DIM1_PITCH  XAI_TILE3D_GET_DIM1_PITCH
+#define XAI_TILE4D_SET_DIM1_PITCH  XAI_TILE3D_SET_DIM1_PITCH
+#define XAI_TILE4D_GET_DIM2        XAI_TILE3D_GET_DIM2
+#define XAI_TILE4D_SET_DIM2        XAI_TILE3D_SET_DIM2
+#define XAI_TILE4D_GET_DIM2_PITCH  XAI_TILE3D_GET_DIM2_PITCH
+#define XAI_TILE4D_SET_DIM2_PITCH  XAI_TILE3D_SET_DIM2_PITCH
+#define XAI_TILE4D_GET_DIM3        XAI_TILE3D_GET_DIM3
+#define XAI_TILE4D_SET_DIM3        XAI_TILE3D_SET_DIM3
+#define XAI_TILE4D_GET_DIM3_PITCH(x)     ((x)->dim3Pitch) // 4D-only members are accessed directly
+#define XAI_TILE4D_SET_DIM3_PITCH(x, v)  ((x)->dim3Pitch = (v))
+#define XAI_TILE4D_GET_DIM4(x)           ((x)->dim4Size)
+#define XAI_TILE4D_SET_DIM4(x, v)        ((x)->dim4Size = (v))
+#define XAI_TILE4D_GET_DIM1_EDGE1  XAI_TILE3D_GET_DIM1_EDGE1
+#define XAI_TILE4D_SET_DIM1_EDGE1  XAI_TILE3D_SET_DIM1_EDGE1
+#define XAI_TILE4D_GET_DIM1_EDGE2  XAI_TILE3D_GET_DIM1_EDGE2
+#define XAI_TILE4D_SET_DIM1_EDGE2  XAI_TILE3D_SET_DIM1_EDGE2
+#define XAI_TILE4D_GET_DIM2_EDGE1  XAI_TILE3D_GET_DIM2_EDGE1
+#define XAI_TILE4D_SET_DIM2_EDGE1  XAI_TILE3D_SET_DIM2_EDGE1
+#define XAI_TILE4D_GET_DIM2_EDGE2  XAI_TILE3D_GET_DIM2_EDGE2
+#define XAI_TILE4D_SET_DIM2_EDGE2  XAI_TILE3D_SET_DIM2_EDGE2
+#define XAI_TILE4D_GET_DIM3_EDGE1  XAI_TILE3D_GET_DIM3_EDGE1
+#define XAI_TILE4D_SET_DIM3_EDGE1  XAI_TILE3D_SET_DIM3_EDGE1
+#define XAI_TILE4D_GET_DIM3_EDGE2  XAI_TILE3D_GET_DIM3_EDGE2
+#define XAI_TILE4D_SET_DIM3_EDGE2  XAI_TILE3D_SET_DIM3_EDGE2
+#define XAI_TILE4D_GET_DATA_ORDER  XAI_TILE3D_GET_DATA_ORDER
+#define XAI_TILE4D_SET_DATA_ORDER  XAI_TILE3D_SET_DATA_ORDER
+#define XAI_TILE4D_GET_DIM1_COORD  XAI_TILE3D_GET_DIM1_COORD
+#define XAI_TILE4D_SET_DIM1_COORD  XAI_TILE3D_SET_DIM1_COORD
+#define XAI_TILE4D_GET_DIM2_COORD  XAI_TILE3D_GET_DIM2_COORD
+#define XAI_TILE4D_SET_DIM2_COORD  XAI_TILE3D_SET_DIM2_COORD
+#define XAI_TILE4D_GET_DIM3_COORD  XAI_TILE3D_GET_DIM3_COORD
+#define XAI_TILE4D_SET_DIM3_COORD  XAI_TILE3D_SET_DIM3_COORD
+#define XAI_TILE4D_GET_DIM4_COORD(x)     ((x)->dim4Loc)
+#define XAI_TILE4D_SET_DIM4_COORD(x, v)  ((x)->dim4Loc = (v))
+#ifdef GLOW_BUILD // these members exist in the tile structs only when GLOW_BUILD is defined
+#define XAI_TILE4D_GET_PRINT_FLAG(x)     ((x)->printFlag)
+#define XAI_TILE4D_SET_PRINT_FLAG(x, v)  ((x)->printFlag = (v))
+#define XAI_TILE4D_GET_NODE_NAME(x)      ((x)->nodeName)
+#define XAI_TILE4D_SET_NODE_NAME(x, v)   ((x)->nodeName = (v))
+#define XAI_TILE4D_GET_FILE_NAME(x)      ((x)->fileName)
+#define XAI_TILE4D_SET_FILE_NAME(x, v)   ((x)->fileName = (v))
+#endif // GLOW_BUILD
+
+/*****************************************
+*   Data type definitions: element type code OR-ed with the 4D tile type bits
+*****************************************/
+#define XAI_TYPE_IS_TILE4D(type)  (!(((type) & (XAI_TYPE_TILE_MASK)) ^ XAI_TYPE_TILE4D_BITS)) // non-zero when the masked tile bits equal XAI_TYPE_TILE4D_BITS
+
+#define XAI_TILE4D_U4    (XAI_U4 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_U8    (XAI_U8 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_U16   (XAI_U16 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_U32   (XAI_U32 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_U64   (XAI_U64 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_U128  (XAI_U128 | XAI_TYPE_TILE4D_BITS)
+
+#define XAI_TILE4D_S4    (XAI_S4 | XAI_TYPE_TILE4D_BITS) // was XAI_S8, which made S4 identical to S8; assumes XAI_S4 is defined alongside XAI_U4
+#define XAI_TILE4D_S8    (XAI_S8 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_S16   (XAI_S16 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_S32   (XAI_S32 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_S64   (XAI_S64 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_S128  (XAI_S128 | XAI_TYPE_TILE4D_BITS)
+
+#define XAI_TILE4D_F8    (XAI_F8 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_F16   (XAI_F16 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_F32   (XAI_F32 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_F64   (XAI_F64 | XAI_TYPE_TILE4D_BITS)
+#define XAI_TILE4D_F128  (XAI_F128 | XAI_TYPE_TILE4D_BITS)
+
+/*****************************************
+*   4D Frame Access Macros - aliases of the generic XAI_FRAME_* accessors
+*****************************************/
+#define XAI_FRAME4D_GET_BUFF_PTR      XAI_FRAME_GET_BUFF_PTR
+#define XAI_FRAME4D_SET_BUFF_PTR      XAI_FRAME_SET_BUFF_PTR
+
+#define XAI_FRAME4D_GET_BUFF_SIZE     XAI_FRAME_GET_BUFF_SIZE
+#define XAI_FRAME4D_SET_BUFF_SIZE     XAI_FRAME_SET_BUFF_SIZE
+
+#define XAI_FRAME4D_GET_DATA_PTR      XAI_FRAME_GET_DATA_PTR
+#define XAI_FRAME4D_SET_DATA_PTR      XAI_FRAME_SET_DATA_PTR
+
+#define XAI_FRAME4D_GET_PIXEL_RES     XAI_FRAME_GET_PIXEL_RES
+#define XAI_FRAME4D_SET_PIXEL_RES     XAI_FRAME_SET_PIXEL_RES
+
+#define XAI_FRAME4D_GET_PIXEL_FORMAT  XAI_FRAME_GET_PIXEL_FORMAT
+#define XAI_FRAME4D_SET_PIXEL_FORMAT  XAI_FRAME_SET_PIXEL_FORMAT
+
+#define XAI_FRAME4D_GET_PADDING_TYPE  XAI_FRAME_GET_PADDING_TYPE
+#define XAI_FRAME4D_SET_PADDING_TYPE  XAI_FRAME_SET_PADDING_TYPE
+
+/*****************************************
+*   4D Tile Access Macros - aliases of the XAI_TILE2D_* accessors, plus the frame pointer
+*****************************************/
+#define XAI_TILE4D_GET_BUFF_PTR        XAI_TILE2D_GET_BUFF_PTR
+#define XAI_TILE4D_SET_BUFF_PTR        XAI_TILE2D_SET_BUFF_PTR
+#define XAI_TILE4D_SET_BUFF_PTR_COEFF  XAI_TILE2D_SET_BUFF_PTR_COEFF
+
+#define XAI_TILE4D_GET_BUFF_SIZE       XAI_TILE2D_GET_BUFF_SIZE
+#define XAI_TILE4D_SET_BUFF_SIZE       XAI_TILE2D_SET_BUFF_SIZE
+
+#define XAI_TILE4D_GET_DATA_PTR        XAI_TILE2D_GET_DATA_PTR
+#define XAI_TILE4D_SET_DATA_PTR        XAI_TILE2D_SET_DATA_PTR
+#define XAI_TILE4D_SET_DATA_PTR_COEFF  XAI_TILE2D_SET_DATA_PTR_COEFF
+
+#define XAI_TILE4D_GET_STATUS_FLAGS    XAI_TILE2D_GET_STATUS_FLAGS
+#define XAI_TILE4D_SET_STATUS_FLAGS    XAI_TILE2D_SET_STATUS_FLAGS
+
+#define XAI_TILE4D_GET_TYPE            XAI_TILE2D_GET_TYPE
+#define XAI_TILE4D_SET_TYPE            XAI_TILE2D_SET_TYPE
+
+#define XAI_TILE4D_GET_ELEMENT_TYPE    XAI_TILE2D_GET_ELEMENT_TYPE
+#define XAI_TILE4D_GET_ELEMENT_SIZE    XAI_TILE2D_GET_ELEMENT_SIZE
+#define XAI_TILE4D_IS_TILE             XAI_TILE2D_IS_TILE2D /* NOTE(review): aliases the 2D check while XAI_TYPE_IS_TILE4D tests the 4D type bits - confirm this is intended */
+
+#define XAI_TILE4D_GET_FRAME_PTR(pTile4D)            ((pTile4D)->pFrame)
+#define XAI_TILE4D_SET_FRAME_PTR(pTile4D, ptrFrame)  ((pTile4D)->pFrame = ((xai_pFrame4D) (ptrFrame))) /* whole assignment parenthesized, like the other setters, so it is safe inside larger expressions */
+
+#define XAI_TILE4D_CHECK_STATUS_FLAGS_DMA_ONGOING          XAI_TILE2D_CHECK_STATUS_FLAGS_DMA_ONGOING
+#define XAI_TILE4D_CHECK_STATUS_FLAGS_EDGE_PADDING_NEEDED  XAI_TILE2D_CHECK_STATUS_FLAGS_EDGE_PADDING_NEEDED
+
+/***********************************
+*   Other Macros - virtual-frame checks aliased from the 2D tile / generic frame versions
+***********************************/
+#define XAI_TILE4D_CHECK_VIRTUAL_FRAME   XAI_TILE2D_CHECK_VIRTUAL_FRAME
+#define XAI_FRAME4D_CHECK_VIRTUAL_FRAME  XAI_FRAME_CHECK_VIRTUAL_FRAME
+
+// Initializes the 4D tile descriptor pTile from the arguments. alignType optionally rounds the edge (buffer) start or the data pointer up to 32/64/128 bytes. Only Q8, 240 and 341 use alignment = 127; for P6, P1 and Q7 like DSPs alignment = 127 is not supported
+#define XAI_SETUP_TILE4D(type, pTile, pBuf, pFrame, bufSize, dim1Size, dim2Size, dim3Size, dim4Size, dim1Pitch, dim2Pitch, \
+                         dim3Pitch, dim1Edge1, dim1Edge2, dim2Edge1, dim2Edge2, dim3Edge1, dim3Edge2,                      \
+                         dim1Loc, dim2Loc, dim3Loc, dim4Loc, dataOrder, alignType)                                         \
+  {                                                                                                                        \
+    XAI_TILE4D_SET_TYPE(pTile, type); /* set before XAI_TILE4D_GET_ELEMENT_SIZE(pTile) is read below (presumably derived from the type - confirm) */ \
+    XAI_TILE4D_SET_FRAME_PTR(pTile, pFrame);                                                                               \
+    XAI_TILE4D_SET_BUFF_PTR(pTile, pBuf);                                                                                  \
+    XAI_TILE4D_SET_BUFF_SIZE(pTile, bufSize);                                                                              \
+    XAI_TILE4D_SET_DIM1(pTile, dim1Size);                                                                                  \
+    XAI_TILE4D_SET_DIM2(pTile, dim2Size);                                                                                  \
+    XAI_TILE4D_SET_DIM3(pTile, dim3Size);                                                                                  \
+    XAI_TILE4D_SET_DIM4(pTile, dim4Size);                                                                                  \
+    XAI_TILE4D_SET_DIM1_PITCH(pTile, dim1Pitch);                                                                           \
+    XAI_TILE4D_SET_DIM2_PITCH(pTile, dim2Pitch);                                                                           \
+    XAI_TILE4D_SET_DIM3_PITCH(pTile, dim3Pitch);                                                                           \
+    uint8_t *edgePtr  = (uint8_t *) pBuf, *dataPtr; /* edgePtr: start of the edge region; dataPtr is only assigned in the data-aligned branch */ \
+    int32_t alignment = 127; /* byte mask (alignment - 1); 127 is the default and is narrowed to 63 or 31 below */          \
+    if ((alignType == XAI_EDGE_ALIGNED_64) || (alignType == XAI_DATA_ALIGNED_64)) { alignment = 63; }                      \
+    if ((alignType == XAI_EDGE_ALIGNED_32) || (alignType == XAI_DATA_ALIGNED_32)) { alignment = 31; }                      \
+    if ((alignType == XAI_EDGE_ALIGNED_32) || (alignType == XAI_EDGE_ALIGNED_64) || (alignType == EDGE_ALIGNED_128)) /* NOTE(review): the 128 constant has no XAI_ prefix, unlike 32/64 - confirm it is defined */ \
+    {                                                                                                                      \
+      edgePtr = (uint8_t *) (((uintptr_t) (pBuf) + alignment) & (~alignment)); /* round the buffer start up to the alignment */ \
+    }                                                                                                                      \
+    XAI_TILE4D_SET_DATA_PTR(pTile, edgePtr + ((dim3Edge1) * (dim2Pitch) + (dim2Edge1) * (dim1Pitch) + (dim1Edge1)) /* skip the leading dim3/dim2/dim1 edges; dim3Pitch is not part of the offset */ \
+                            * XAI_TILE4D_GET_ELEMENT_SIZE(pTile)); /* element offset scaled by the element size */          \
+    if ((alignType == XAI_DATA_ALIGNED_32) || (alignType == XAI_DATA_ALIGNED_64) || (alignType == DATA_ALIGNED_128)) /* NOTE(review): same unprefixed 128 constant as above */ \
+    {                                                                                                                      \
+      dataPtr = (uint8_t *) XAI_TILE4D_GET_DATA_PTR(pTile);                                                                \
+      dataPtr = (uint8_t *) (((uintptr_t) (dataPtr) + alignment) & (~alignment)); /* round the data pointer itself up to the alignment */ \
+      XAI_TILE4D_SET_DATA_PTR(pTile, dataPtr);                                                                             \
+    }                                                                                                                      \
+    XAI_TILE4D_SET_DIM1_EDGE1(pTile, dim1Edge1);                                                                           \
+    XAI_TILE4D_SET_DIM1_EDGE2(pTile, dim1Edge2);                                                                           \
+    XAI_TILE4D_SET_DIM2_EDGE1(pTile, dim2Edge1);                                                                           \
+    XAI_TILE4D_SET_DIM2_EDGE2(pTile, dim2Edge2);                                                                           \
+    XAI_TILE4D_SET_DIM3_EDGE1(pTile, dim3Edge1);                                                                           \
+    XAI_TILE4D_SET_DIM3_EDGE2(pTile, dim3Edge2);                                                                           \
+    XAI_TILE4D_SET_DIM1_COORD(pTile, dim1Loc);                                                                             \
+    XAI_TILE4D_SET_DIM2_COORD(pTile, dim2Loc);                                                                             \
+    XAI_TILE4D_SET_DIM3_COORD(pTile, dim3Loc);                                                                             \
+    XAI_TILE4D_SET_DIM4_COORD(pTile, dim4Loc);                                                                             \
+    XAI_TILE4D_SET_DATA_ORDER(pTile, dataOrder);                                                                           \
+  }
+
+#define XAI_SETUP_FRAME4D(pFrame, pFrameBuffer, bufSize, dim1Size, dim2Size, dim3Size, dim4Size, dim1Pitch, dim2Pitch, dim3Pitch,          \
+                          dim1Edge1, dim1Edge2, dim2Edge1, dim2Edge2, dim3Edge1, dim3Edge2, pixRes, pixPackFormat, paddingType, dataOrder) \
+  { /* Initializes the 4D frame descriptor pFrame from the arguments. Expands to a brace block, so use it as a statement. */               \
+    XAI_FRAME4D_SET_BUFF_PTR(pFrame, pFrameBuffer);                                                                                        \
+    XAI_FRAME4D_SET_BUFF_SIZE(pFrame, bufSize);                                                                                            \
+    XAI_FRAME4D_SET_DIM1(pFrame, dim1Size);                                                                                                \
+    XAI_FRAME4D_SET_DIM2(pFrame, dim2Size);                                                                                                \
+    XAI_FRAME4D_SET_DIM3(pFrame, dim3Size);                                                                                                \
+    XAI_FRAME4D_SET_DIM4(pFrame, dim4Size);                                                                                                \
+    XAI_FRAME4D_SET_DIM1_PITCH(pFrame, dim1Pitch);                                                                                         \
+    XAI_FRAME4D_SET_DIM2_PITCH(pFrame, dim2Pitch);                                                                                         \
+    XAI_FRAME4D_SET_DIM3_PITCH(pFrame, dim3Pitch);                                                                                         \
+    XAI_FRAME4D_SET_DATA_PTR(pFrame, pFrameBuffer + ((dim3Edge1) * (dim2Pitch) + (dim2Edge1) * (dim1Pitch) + /* data ptr = buffer start + leading dim3/dim2/dim1 edges; */ \
+                                                     (dim1Edge1)) * pixRes); /* NOTE(review): the offset is scaled by pixRes as passed - confirm units */ \
+    XAI_FRAME4D_SET_DIM1_EDGE1(pFrame, dim1Edge1);                                                                                         \
+    XAI_FRAME4D_SET_DIM1_EDGE2(pFrame, dim1Edge2);                                                                                         \
+    XAI_FRAME4D_SET_DIM2_EDGE1(pFrame, dim2Edge1);                                                                                         \
+    XAI_FRAME4D_SET_DIM2_EDGE2(pFrame, dim2Edge2);                                                                                         \
+    XAI_FRAME4D_SET_DIM3_EDGE1(pFrame, dim3Edge1);                                                                                         \
+    XAI_FRAME4D_SET_DIM3_EDGE2(pFrame, dim3Edge2);                                                                                         \
+    XAI_FRAME4D_SET_PIXEL_RES(pFrame, pixRes);                                                                                             \
+    XAI_FRAME4D_SET_PIXEL_FORMAT(pFrame, pixPackFormat);                                                                                   \
+    XAI_FRAME4D_SET_PADDING_TYPE(pFrame, paddingType);                                                                                     \
+    XAI_FRAME4D_SET_DATA_ORDER(pFrame, dataOrder); /* targets pFrame: pTile, used here before, is not a parameter of this macro */         \
+  }
+
+#define XAI_COPY_FRAME4D_TO_TILE4D(frame, tile)  { /* frame -> tile: sizes, pitches, edges, then data pointer and data order */ \
+    XAI_TILE4D_SET_DIM1(tile, XAI_FRAME4D_GET_DIM1(frame));             \
+    XAI_TILE4D_SET_DIM2(tile, XAI_FRAME4D_GET_DIM2(frame));             \
+    XAI_TILE4D_SET_DIM3(tile, XAI_FRAME4D_GET_DIM3(frame));             \
+    XAI_TILE4D_SET_DIM4(tile, XAI_FRAME4D_GET_DIM4(frame));             \
+    XAI_TILE4D_SET_DIM1_PITCH(tile, XAI_FRAME4D_GET_DIM1_PITCH(frame)); \
+    XAI_TILE4D_SET_DIM2_PITCH(tile, XAI_FRAME4D_GET_DIM2_PITCH(frame)); \
+    XAI_TILE4D_SET_DIM3_PITCH(tile, XAI_FRAME4D_GET_DIM3_PITCH(frame)); \
+    XAI_TILE4D_SET_DIM1_EDGE1(tile, XAI_FRAME4D_GET_DIM1_EDGE1(frame)); \
+    XAI_TILE4D_SET_DIM1_EDGE2(tile, XAI_FRAME4D_GET_DIM1_EDGE2(frame)); \
+    XAI_TILE4D_SET_DIM2_EDGE1(tile, XAI_FRAME4D_GET_DIM2_EDGE1(frame)); \
+    XAI_TILE4D_SET_DIM2_EDGE2(tile, XAI_FRAME4D_GET_DIM2_EDGE2(frame)); \
+    XAI_TILE4D_SET_DIM3_EDGE1(tile, XAI_FRAME4D_GET_DIM3_EDGE1(frame)); \
+    XAI_TILE4D_SET_DIM3_EDGE2(tile, XAI_FRAME4D_GET_DIM3_EDGE2(frame)); \
+    XAI_TILE4D_SET_DATA_PTR(tile, XAI_FRAME4D_GET_DATA_PTR(frame));     \
+    XAI_TILE4D_SET_DATA_ORDER(tile, XAI_FRAME4D_GET_DATA_ORDER(frame)); \
+}
+
+#define XAI_COPY_FRAME4D_TO_FRAME4D(frameIn, frameOut)  { /* frameIn -> frameOut: sizes, pitches, edges, then data pointer, data order and pixel resolution */ \
+    XAI_FRAME4D_SET_DIM1(frameOut, XAI_FRAME4D_GET_DIM1(frameIn));             \
+    XAI_FRAME4D_SET_DIM2(frameOut, XAI_FRAME4D_GET_DIM2(frameIn));             \
+    XAI_FRAME4D_SET_DIM3(frameOut, XAI_FRAME4D_GET_DIM3(frameIn));             \
+    XAI_FRAME4D_SET_DIM4(frameOut, XAI_FRAME4D_GET_DIM4(frameIn));             \
+    XAI_FRAME4D_SET_DIM1_PITCH(frameOut, XAI_FRAME4D_GET_DIM1_PITCH(frameIn)); \
+    XAI_FRAME4D_SET_DIM2_PITCH(frameOut, XAI_FRAME4D_GET_DIM2_PITCH(frameIn)); \
+    XAI_FRAME4D_SET_DIM3_PITCH(frameOut, XAI_FRAME4D_GET_DIM3_PITCH(frameIn)); \
+    XAI_FRAME4D_SET_DIM1_EDGE1(frameOut, XAI_FRAME4D_GET_DIM1_EDGE1(frameIn)); \
+    XAI_FRAME4D_SET_DIM1_EDGE2(frameOut, XAI_FRAME4D_GET_DIM1_EDGE2(frameIn)); \
+    XAI_FRAME4D_SET_DIM2_EDGE1(frameOut, XAI_FRAME4D_GET_DIM2_EDGE1(frameIn)); \
+    XAI_FRAME4D_SET_DIM2_EDGE2(frameOut, XAI_FRAME4D_GET_DIM2_EDGE2(frameIn)); \
+    XAI_FRAME4D_SET_DIM3_EDGE1(frameOut, XAI_FRAME4D_GET_DIM3_EDGE1(frameIn)); \
+    XAI_FRAME4D_SET_DIM3_EDGE2(frameOut, XAI_FRAME4D_GET_DIM3_EDGE2(frameIn)); \
+    XAI_FRAME4D_SET_DATA_PTR(frameOut, XAI_FRAME4D_GET_DATA_PTR(frameIn));     \
+    XAI_FRAME4D_SET_DATA_ORDER(frameOut, XAI_FRAME4D_GET_DATA_ORDER(frameIn)); \
+    XAI_FRAME4D_SET_PIXEL_RES(frameOut, XAI_FRAME4D_GET_PIXEL_RES(frameIn));   \
+}
+
+#define XAI_COPY_TILE4D_TO_TILE4D(tileIn, tileOut)      { /* tileIn -> tileOut: sizes, pitches, edges, then data pointer and data order */ \
+    XAI_TILE4D_SET_DIM1(tileOut, XAI_TILE4D_GET_DIM1(tileIn));             \
+    XAI_TILE4D_SET_DIM2(tileOut, XAI_TILE4D_GET_DIM2(tileIn));             \
+    XAI_TILE4D_SET_DIM3(tileOut, XAI_TILE4D_GET_DIM3(tileIn));             \
+    XAI_TILE4D_SET_DIM4(tileOut, XAI_TILE4D_GET_DIM4(tileIn));             \
+    XAI_TILE4D_SET_DIM1_PITCH(tileOut, XAI_TILE4D_GET_DIM1_PITCH(tileIn)); \
+    XAI_TILE4D_SET_DIM2_PITCH(tileOut, XAI_TILE4D_GET_DIM2_PITCH(tileIn)); \
+    XAI_TILE4D_SET_DIM3_PITCH(tileOut, XAI_TILE4D_GET_DIM3_PITCH(tileIn)); \
+    XAI_TILE4D_SET_DIM1_EDGE1(tileOut, XAI_TILE4D_GET_DIM1_EDGE1(tileIn)); \
+    XAI_TILE4D_SET_DIM1_EDGE2(tileOut, XAI_TILE4D_GET_DIM1_EDGE2(tileIn)); \
+    XAI_TILE4D_SET_DIM2_EDGE1(tileOut, XAI_TILE4D_GET_DIM2_EDGE1(tileIn)); \
+    XAI_TILE4D_SET_DIM2_EDGE2(tileOut, XAI_TILE4D_GET_DIM2_EDGE2(tileIn)); \
+    XAI_TILE4D_SET_DIM3_EDGE1(tileOut, XAI_TILE4D_GET_DIM3_EDGE1(tileIn)); \
+    XAI_TILE4D_SET_DIM3_EDGE2(tileOut, XAI_TILE4D_GET_DIM3_EDGE2(tileIn)); \
+    XAI_TILE4D_SET_DATA_PTR(tileOut, XAI_TILE4D_GET_DATA_PTR(tileIn));     \
+    XAI_TILE4D_SET_DATA_ORDER(tileOut, XAI_TILE4D_GET_DATA_ORDER(tileIn)); \
+}
+
+// Sets both dim1 edges to newEdgeSize, shifts the data pointer by the edge delta and sets dim1 = dim1Pitch - 2 * newEdgeSize. Assumes 8 bit pixRes and Edge1 = Edge2
+#define XAI_TILE4D_UPDATE_EDGE_DIM1(pTile, newEdgeSize)                  \
+  { /* newEdgeSize is evaluated several times: no side effects */        \
+    uint16_t currEdgeSize = (uint16_t) XAI_TILE4D_GET_DIM1_EDGE1(pTile); \
+    uint32_t dim1Pitch    = (uint32_t) XAI_TILE4D_GET_DIM1_PITCH(pTile); \
+    uintptr_t dataU32     = (uintptr_t) XAI_TILE4D_GET_DATA_PTR(pTile);  \
+    dataU32 = dataU32 + (newEdgeSize) - currEdgeSize;                    \
+    XAI_TILE4D_SET_DATA_PTR(pTile, (void *) dataU32);                    \
+    XAI_TILE4D_SET_DIM1_EDGE1(pTile, (newEdgeSize));                     \
+    XAI_TILE4D_SET_DIM1_EDGE2(pTile, (newEdgeSize));                     \
+    XAI_TILE4D_SET_DIM1(pTile, dim1Pitch - 2 * (newEdgeSize));           \
+  }
+
+// Sets both dim2 edges to newEdgeSize, shifts the data pointer by dim1Pitch * edge delta and resizes dim2 so that dim2 + 2 * edge stays constant. Assumes 8 bit pixRes and Edge1 = Edge2
+#define XAI_TILE4D_UPDATE_EDGE_DIM2(pTile, newEdgeSize)                                                  \
+  { /* delta is taken at pointer width so shrinking the edge also works when uintptr_t is 64-bit */      \
+    uint16_t currEdgeSize = (uint16_t) XAI_TILE4D_GET_DIM2_EDGE1(pTile);                                 \
+    uint32_t dim1Pitch    = (uint32_t) XAI_TILE4D_GET_DIM1_PITCH(pTile);                                 \
+    uint16_t dim2Size     = (uint16_t) XAI_TILE4D_GET_DIM2(pTile);                                       \
+    uintptr_t dataU32     = (uintptr_t) XAI_TILE4D_GET_DATA_PTR(pTile);                                  \
+    dataU32 += (uintptr_t) dim1Pitch * (uintptr_t) ((intptr_t) (newEdgeSize) - (intptr_t) currEdgeSize); \
+    XAI_TILE4D_SET_DATA_PTR(pTile, (void *) dataU32);                                                    \
+    XAI_TILE4D_SET_DIM2_EDGE1(pTile, (newEdgeSize));                                                     \
+    XAI_TILE4D_SET_DIM2_EDGE2(pTile, (newEdgeSize));                                                     \
+    XAI_TILE4D_SET_DIM2(pTile, dim2Size + 2 * (currEdgeSize - (newEdgeSize)));                           \
+  }
+
+// Sets both dim3 edges to newEdgeSize, shifts the data pointer by dim2Pitch * edge delta and resizes dim3 so that dim3 + 2 * edge stays constant. Assumes 8 bit pixRes and Edge1 = Edge2
+#define XAI_TILE4D_UPDATE_EDGE_DIM3(pTile, newEdgeSize)                                                  \
+  { /* delta is taken at pointer width so shrinking the edge also works when uintptr_t is 64-bit */      \
+    uint16_t currEdgeSize = (uint16_t) XAI_TILE4D_GET_DIM3_EDGE1(pTile);                                 \
+    uint32_t dim2Pitch    = (uint32_t) XAI_TILE4D_GET_DIM2_PITCH(pTile);                                 \
+    uint16_t dim3Size     = (uint16_t) XAI_TILE4D_GET_DIM3(pTile);                                       \
+    uintptr_t dataU32     = (uintptr_t) XAI_TILE4D_GET_DATA_PTR(pTile);                                  \
+    dataU32 += (uintptr_t) dim2Pitch * (uintptr_t) ((intptr_t) (newEdgeSize) - (intptr_t) currEdgeSize); \
+    XAI_TILE4D_SET_DATA_PTR(pTile, (void *) dataU32);                                                    \
+    XAI_TILE4D_SET_DIM3_EDGE1(pTile, (newEdgeSize));                                                     \
+    XAI_TILE4D_SET_DIM3_EDGE2(pTile, (newEdgeSize));                                                     \
+    XAI_TILE4D_SET_DIM3(pTile, dim3Size + 2 * (currEdgeSize - (newEdgeSize)));                           \
+  }
+
+#define XAI_TILE4D_UPDATE_DIMENSIONS(pTile, dim1Loc, dim2Loc, dim3Loc, dim4Loc, dim1Size, dim2Size, dim3Size, dim4Size, \
+                                     dim1Pitch, dim2Pitch, dim3Pitch)                                                   \
+  { /* Grouped per dimension: coordinate, extent, pitch; dim4 takes no pitch. */                                        \
+    XAI_TILE4D_SET_DIM1_COORD(pTile, dim1Loc);                                                                          \
+    XAI_TILE4D_SET_DIM1(pTile, dim1Size);                                                                               \
+    XAI_TILE4D_SET_DIM1_PITCH(pTile, dim1Pitch);                                                                        \
+    XAI_TILE4D_SET_DIM2_COORD(pTile, dim2Loc);                                                                          \
+    XAI_TILE4D_SET_DIM2(pTile, dim2Size);                                                                               \
+    XAI_TILE4D_SET_DIM2_PITCH(pTile, dim2Pitch);                                                                        \
+    XAI_TILE4D_SET_DIM3_COORD(pTile, dim3Loc);                                                                          \
+    XAI_TILE4D_SET_DIM3(pTile, dim3Size);                                                                               \
+    XAI_TILE4D_SET_DIM3_PITCH(pTile, dim3Pitch);                                                                        \
+    XAI_TILE4D_SET_DIM4_COORD(pTile, dim4Loc);                                                                          \
+    XAI_TILE4D_SET_DIM4(pTile, dim4Size);                                                                               \
+  }
+
+// 5D tile: member list expanded inside xai_tile5D right after its pBuffer/pData pointers.
+#define XAI_TILE5D_FIELDS \
+  uint32_t bufferSize;    \
+  int32_t dim1Size;       \
+  int32_t dim1Pitch;      \
+  uint16_t type;          /* NOTE(review): member order presumably mirrors the lower-rank tile structs whose accessors are reused for 5D - confirm before reordering */ \
+  int32_t dim2Size;       \
+  int32_t dim2Pitch;      \
+  int32_t dim3Size;       \
+  int32_t dim3Pitch;      \
+  int32_t dim4Size;       \
+  int32_t dim4Pitch;      /* read/written by XAI_TILE5D_GET/SET_DIM4_PITCH */ \
+  int32_t dim5Size;       /* read/written by XAI_TILE5D_GET/SET_DIM5 */ \
+  xai_cnn_data_order dataOrder; /* read/written by XAI_TILE5D_GET/SET_DATA_ORDER */
+
+// 5D tile descriptor: two pointers followed by the XAI_TILE5D_FIELDS members; xai_pTile5D is the pointer-to-descriptor typedef.
+typedef struct xai_tile5DStruct
+{
+  void *pBuffer; // presumably the start of the backing buffer (cf. XAI_TILE5D_GET_BUFF_PTR) - confirm
+  void *pData;   // presumably the first data element (cf. XAI_TILE5D_GET_DATA_PTR) - confirm
+  XAI_TILE5D_FIELDS
+} xai_tile5D, *xai_pTile5D;
+
+/*****************************************
+*   5D Tile Access Macros: buffer, data, type and element queries alias the 2D accessors, dims 1-4 alias the 4D ones; dim4 pitch, dim5 and data order are defined here
+*****************************************/
+#define XAI_TILE5D_GET_BUFF_PTR    XAI_TILE2D_GET_BUFF_PTR
+#define XAI_TILE5D_SET_BUFF_PTR    XAI_TILE2D_SET_BUFF_PTR
+
+#define XAI_TILE5D_GET_BUFF_SIZE   XAI_TILE2D_GET_BUFF_SIZE
+#define XAI_TILE5D_SET_BUFF_SIZE   XAI_TILE2D_SET_BUFF_SIZE
+
+#define XAI_TILE5D_GET_DATA_PTR    XAI_TILE2D_GET_DATA_PTR
+#define XAI_TILE5D_SET_DATA_PTR    XAI_TILE2D_SET_DATA_PTR
+
+#define XAI_TILE5D_GET_TYPE        XAI_TILE2D_GET_TYPE
+#define XAI_TILE5D_SET_TYPE        XAI_TILE2D_SET_TYPE
+
+#define XAI_TILE5D_GET_DIM1        XAI_TILE4D_GET_DIM1
+#define XAI_TILE5D_SET_DIM1        XAI_TILE4D_SET_DIM1
+#define XAI_TILE5D_GET_DIM1_PITCH  XAI_TILE4D_GET_DIM1_PITCH
+#define XAI_TILE5D_SET_DIM1_PITCH  XAI_TILE4D_SET_DIM1_PITCH
+#define XAI_TILE5D_GET_DIM2        XAI_TILE4D_GET_DIM2
+#define XAI_TILE5D_SET_DIM2        XAI_TILE4D_SET_DIM2
+#define XAI_TILE5D_GET_DIM2_PITCH  XAI_TILE4D_GET_DIM2_PITCH
+#define XAI_TILE5D_SET_DIM2_PITCH  XAI_TILE4D_SET_DIM2_PITCH
+#define XAI_TILE5D_GET_DIM3        XAI_TILE4D_GET_DIM3
+#define XAI_TILE5D_SET_DIM3        XAI_TILE4D_SET_DIM3
+#define XAI_TILE5D_GET_DIM3_PITCH  XAI_TILE4D_GET_DIM3_PITCH
+#define XAI_TILE5D_SET_DIM3_PITCH  XAI_TILE4D_SET_DIM3_PITCH
+#define XAI_TILE5D_GET_DIM4        XAI_TILE4D_GET_DIM4
+#define XAI_TILE5D_SET_DIM4        XAI_TILE4D_SET_DIM4
+#define XAI_TILE5D_GET_DIM4_PITCH(x)     ((x)->dim4Pitch)
+#define XAI_TILE5D_SET_DIM4_PITCH(x, v)  ((x)->dim4Pitch = (v))
+#define XAI_TILE5D_GET_DIM5(x)           ((x)->dim5Size)
+#define XAI_TILE5D_SET_DIM5(x, v)        ((x)->dim5Size = (v))
+#define XAI_TILE5D_GET_DATA_ORDER(x)     ((x)->dataOrder)
+#define XAI_TILE5D_SET_DATA_ORDER(x, v)  ((x)->dataOrder = (v))
+#define XAI_TILE5D_GET_ELEMENT_TYPE  XAI_TILE2D_GET_ELEMENT_TYPE
+#define XAI_TILE5D_GET_ELEMENT_SIZE  XAI_TILE2D_GET_ELEMENT_SIZE
+
+#if USE_64BIT_COEFF        // non-zero: the coefficient aliases resolve to the *_64 types
+#define xai_pArray_coeff   xai_pArray_coeff_64
+#define xai_pTile3D_coeff  xai_pTile3D_64
+#define xai_pTile4D_coeff  xai_pTile4D_64
+#else                      // zero or undefined: xai_pArray_coeff_32 and the plain xai_pTile3D / xai_pTile4D
+#define xai_pArray_coeff   xai_pArray_coeff_32
+#define xai_pTile3D_coeff  xai_pTile3D
+#define xai_pTile4D_coeff  xai_pTile4D
+#endif // #if USE_64BIT_COEFF
+#endif // #ifndef __XAI_TILE_MANAGER_H__
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_cast.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_cast.c
new file mode 100644
index 00000000000..92fd72ebb91
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_cast.c
@@ -0,0 +1,1622 @@
+/*
+ * Copyright (c) 2025 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+#include <string.h>
+
+/* ----------------------------------------------------------------------------------------------------------------------- */
+#if XCHAL_HAVE_VISION // Optimized code is called for Vision DSPs
+/* ----------------------------------------------------------------------------------------------------------------------- */
+#include "cnn_cast_scalar.h"
+
+#ifdef IN_DATA_TYPE  /* clear any IN/OUT_DATA_TYPE left defined by earlier code before instantiating */
+#undef IN_DATA_TYPE
+#endif
+#ifdef OUT_DATA_TYPE
+#undef OUT_DATA_TYPE
+#endif
+
+#define IN_DATA_TYPE   UNSIGNED8BIT  /* U8 source; each cnn_cast.h inclusion below is parameterised by IN/OUT_DATA_TYPE (presumably emitting xaiCast3DFromU8To<OUT> - confirm in cnn_cast.h) */
+#define OUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#ifdef IVP_LAVN_4X64U_XP  /* 64-bit destinations only when this is defined */
+#define OUT_DATA_TYPE  UNSIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif // #ifdef IVP_LAVN_4X64U_XP
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+#undef IN_DATA_TYPE
+
+
+#define IN_DATA_TYPE   SIGNED8BIT  /* S8 source: one cnn_cast.h inclusion per destination type, no S8 -> S8 */
+#define OUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#ifdef IVP_LAVN_4X64U_XP  /* 64-bit destinations only when this is defined */
+#define OUT_DATA_TYPE  UNSIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+#undef IN_DATA_TYPE
+
+
+#define IN_DATA_TYPE   UNSIGNED16BIT  /* U16 source: one cnn_cast.h inclusion per destination type, no U16 -> U16 */
+#define OUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#ifdef IVP_LAVN_4X64U_XP  /* 64-bit destinations only when this is defined */
+#define OUT_DATA_TYPE  UNSIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+#undef IN_DATA_TYPE
+
+
+#define IN_DATA_TYPE   SIGNED16BIT  /* S16 source: one cnn_cast.h inclusion per destination type, no S16 -> S16 */
+#define OUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#ifdef IVP_LAVN_4X64U_XP  /* 64-bit destinations only when this is defined */
+#define OUT_DATA_TYPE  UNSIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+#undef IN_DATA_TYPE
+
+
+#define IN_DATA_TYPE   UNSIGNED32BIT  /* U32 source: one cnn_cast.h inclusion per destination type, no U32 -> U32 */
+#define OUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#ifdef IVP_LAVN_4X64U_XP  /* 64-bit destinations only when this is defined */
+#define OUT_DATA_TYPE  UNSIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+#undef IN_DATA_TYPE
+
+
+#define IN_DATA_TYPE   SIGNED32BIT  /* S32 source: one cnn_cast.h inclusion per destination type, no S32 -> S32 */
+#define OUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#ifdef IVP_LAVN_4X64U_XP  /* 64-bit destinations only when this is defined */
+#define OUT_DATA_TYPE  UNSIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+#undef IN_DATA_TYPE
+
+#ifdef IVP_LAVN_4X64U_XP  /* U64 and S64 sources are instantiated only when this is defined */
+#define IN_DATA_TYPE   UNSIGNED64BIT  /* U64 source: one cnn_cast.h inclusion per destination type, no U64 -> U64 */
+#define OUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+#undef IN_DATA_TYPE
+
+
+#define IN_DATA_TYPE   SIGNED64BIT  /* S64 source: one cnn_cast.h inclusion per destination type, no S64 -> S64 */
+#define OUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+#undef IN_DATA_TYPE
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))  /* F16 sources need one of the HP_VFPU options set to 1 */
+#define IN_DATA_TYPE   FLOAT16BIT  /* F16 source: one cnn_cast.h inclusion per destination type, no F16 -> F16 */
+#define OUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#ifdef IVP_LAVN_4X64U_XP  /* 64-bit destinations only when this is defined */
+#define OUT_DATA_TYPE  UNSIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+#undef IN_DATA_TYPE
+#endif //#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+
+
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))  /* F32 sources need one of the SP_VFPU options set to 1 */
+#define IN_DATA_TYPE   FLOAT32BIT  /* F32 source: one cnn_cast.h inclusion per destination type, no F32 -> F32 */
+#define OUT_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED8BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED32BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#ifdef IVP_LAVN_4X64U_XP  /* 64-bit destinations only when this is defined */
+#define OUT_DATA_TYPE  UNSIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+
+#define OUT_DATA_TYPE  SIGNED64BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#define OUT_DATA_TYPE  FLOAT16BIT
+#include "cnn_cast.h"
+#undef OUT_DATA_TYPE
+#endif
+#undef IN_DATA_TYPE
+#endif //#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+
+/**************************** xaiCast3D *****************************************/
+/* Description  : General API for data casting                                  */
+/* Inputs       : inTile                                                        */
+/* Outputs      : XAI Error Code                                                */
+/* InOuts       : outTile                                                       */
+/********************************************************************************/
+XAI_ERR_TYPE xaiCast3D(const xai_pTile3D inTile,
+                       xai_pTile3D outTile)
+{
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(inTile);
+    XAI_CHECK_POINTER(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nInput Data Order %d and Output Data Order %d are not same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+  {
+    switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile))
+    {
+      case XAI_S8:
+        xaiCast3DFromU8ToS8(inTile, outTile);
+        break;
+      case XAI_U16:
+        xaiCast3DFromU8ToU16(inTile, outTile);
+        break;
+      case XAI_S16:
+        xaiCast3DFromU8ToS16(inTile, outTile);
+        break;
+      case XAI_U32:
+        xaiCast3DFromU8ToU32(inTile, outTile);
+        break;
+      case XAI_S32:
+        xaiCast3DFromU8ToS32(inTile, outTile);
+        break;
+      case XAI_U64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromU8ToU64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+      case XAI_S64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromU8ToS64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+      case XAI_F16:
+        xaiCast3DFromU8ToF16(inTile, outTile);
+        break;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+      case XAI_F32:
+        xaiCast3DFromU8ToF32(inTile, outTile);
+        break;
+#endif
+      default:
+        return(XAI_ERR_DATATYPE);
+        break;
+    }
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) /* S8 source: dispatch on the output element type */
+  {
+    switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile))
+    {
+      case XAI_U8:
+        xaiCast3DFromS8ToU8(inTile, outTile);
+        break;
+      case XAI_U16:
+        xaiCast3DFromS8ToU16(inTile, outTile);
+        break; /* previously missing: control fell through and also ran the S8 -> S16 cast on outTile */
+      case XAI_S16:
+        xaiCast3DFromS8ToS16(inTile, outTile);
+        break;
+      case XAI_U32:
+        xaiCast3DFromS8ToU32(inTile, outTile);
+        break;
+      case XAI_S32:
+        xaiCast3DFromS8ToS32(inTile, outTile);
+        break;
+      case XAI_U64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromS8ToU64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile); /* scalar routine is used when IVP_LAVN_4X64U_XP is not defined */
+#endif
+        break;
+      case XAI_S64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromS8ToS64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+      case XAI_F16:
+        xaiCast3DFromS8ToF16(inTile, outTile);
+        break;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+      case XAI_F32:
+        xaiCast3DFromS8ToF32(inTile, outTile);
+        break;
+#endif
+      default:
+        return(XAI_ERR_DATATYPE); /* unsupported output type; the return already ends this clause */
+    }
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U16))
+  {
+    switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile))
+    {
+      case XAI_U8:
+        xaiCast3DFromU16ToU8(inTile, outTile);
+        break;
+      case XAI_S8:
+        xaiCast3DFromU16ToS8(inTile, outTile);
+        break;
+      case XAI_S16:
+        xaiCast3DFromU16ToS16(inTile, outTile);
+        break;
+      case XAI_U32:
+        xaiCast3DFromU16ToU32(inTile, outTile);
+        break;
+      case XAI_S32:
+        xaiCast3DFromU16ToS32(inTile, outTile);
+        break;
+      case XAI_U64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromU16ToU64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+      case XAI_S64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromU16ToS64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+      case XAI_F16:
+        xaiCast3DFromU16ToF16(inTile, outTile);
+        break;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+      case XAI_F32:
+        xaiCast3DFromU16ToF32(inTile, outTile);
+        break;
+#endif
+      default:
+        return(XAI_ERR_DATATYPE);
+        break;
+    }
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16))
+  {
+    switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile))
+    {
+      case XAI_U8:
+        xaiCast3DFromS16ToU8(inTile, outTile);
+        break;
+      case XAI_S8:
+        xaiCast3DFromS16ToS8(inTile, outTile);
+        break;
+      case XAI_U16:
+        xaiCast3DFromS16ToU16(inTile, outTile);
+        break;
+      case XAI_U32:
+        xaiCast3DFromS16ToU32(inTile, outTile);
+        break;
+      case XAI_S32:
+        xaiCast3DFromS16ToS32(inTile, outTile);
+        break;
+      case XAI_U64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromS16ToU64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+      case XAI_S64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromS16ToS64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+      case XAI_F16:
+        xaiCast3DFromS16ToF16(inTile, outTile);
+        break;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+      case XAI_F32:
+        xaiCast3DFromS16ToF32(inTile, outTile);
+        break;
+#endif
+      default:
+        return(XAI_ERR_DATATYPE);
+        break;
+    }
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U32))
+  {
+    switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile))
+    {
+      case XAI_U8:
+        xaiCast3DFromU32ToU8(inTile, outTile);
+        break;
+      case XAI_S8:
+        xaiCast3DFromU32ToS8(inTile, outTile);
+        break;
+      case XAI_U16:
+        xaiCast3DFromU32ToU16(inTile, outTile);
+        break;
+      case XAI_S16:
+        xaiCast3DFromU32ToS16(inTile, outTile);
+        break;
+      case XAI_S32:
+        xaiCast3DFromU32ToS32(inTile, outTile);
+        break;
+      case XAI_U64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromU32ToU64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+      case XAI_S64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromU32ToS64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+      case XAI_F16:
+        xaiCast3DFromU32ToF16(inTile, outTile);
+        break;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+      case XAI_F32:
+        xaiCast3DFromU32ToF32(inTile, outTile);
+        break;
+#endif
+      default:
+        return(XAI_ERR_DATATYPE);
+        break;
+    }
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S32))
+  {
+    switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile))
+    {
+      case XAI_U8:
+        xaiCast3DFromS32ToU8(inTile, outTile);
+        break;
+      case XAI_S8:
+        xaiCast3DFromS32ToS8(inTile, outTile);
+        break;
+      case XAI_U16:
+        xaiCast3DFromS32ToU16(inTile, outTile);
+        break;
+      case XAI_S16:
+        xaiCast3DFromS32ToS16(inTile, outTile);
+        break;
+      case XAI_U32:
+        xaiCast3DFromS32ToU32(inTile, outTile);
+        break;
+      case XAI_U64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromS32ToU64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+      case XAI_S64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromS32ToS64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+      case XAI_F16:
+        xaiCast3DFromS32ToF16(inTile, outTile);
+        break;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+      case XAI_F32:
+        xaiCast3DFromS32ToF32(inTile, outTile);
+        break;
+#endif
+      default:
+        return(XAI_ERR_DATATYPE);
+        break;
+    }
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U64))
+  {
+#ifdef IVP_LAVN_4X64U_XP
+    switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile))
+    {
+      case XAI_U8:
+        xaiCast3DFromU64ToU8(inTile, outTile);
+        break;
+      case XAI_S8:
+        xaiCast3DFromU64ToS8(inTile, outTile);
+        break;
+      case XAI_U16:
+        xaiCast3DFromU64ToU16(inTile, outTile);
+        break;
+      case XAI_S16:
+        xaiCast3DFromU64ToS16(inTile, outTile);
+        break;
+      case XAI_U32:
+        xaiCast3DFromU64ToU32(inTile, outTile);
+        break;
+      case XAI_S32:
+        xaiCast3DFromU64ToS32(inTile, outTile);
+        break;
+      case XAI_S64:
+        xaiCast3DFromU64ToS64(inTile, outTile);
+        break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+      case XAI_F16:
+        xaiCast3DFromU64ToF16(inTile, outTile);
+        break;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+      case XAI_F32:
+        xaiCast3DFromU64ToF32(inTile, outTile);
+        break;
+#endif
+      default:
+        return(XAI_ERR_DATATYPE);
+        break;
+    }
+#else //#ifdef IVP_LAVN_4X64U_XP
+    if (!XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64))
+    {
+      xaiCast3DScalar_I64(inTile, outTile);
+    }
+    else
+    {
+      return(XAI_ERR_DATATYPE);
+    }
+#endif  //#ifdef IVP_LAVN_4X64U_XP
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S64))
+  {
+#ifdef IVP_LAVN_4X64U_XP
+    switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile))
+    {
+      case XAI_U8:
+        xaiCast3DFromS64ToU8(inTile, outTile);
+        break;
+      case XAI_S8:
+        xaiCast3DFromS64ToS8(inTile, outTile);
+        break;
+      case XAI_U16:
+        xaiCast3DFromS64ToU16(inTile, outTile);
+        break;
+      case XAI_S16:
+        xaiCast3DFromS64ToS16(inTile, outTile);
+        break;
+      case XAI_U32:
+        xaiCast3DFromS64ToU32(inTile, outTile);
+        break;
+      case XAI_S32:
+        xaiCast3DFromS64ToS32(inTile, outTile);
+        break;
+      case XAI_U64:
+        xaiCast3DFromS64ToU64(inTile, outTile);
+        break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+      case XAI_F16:
+        xaiCast3DFromS64ToF16(inTile, outTile);
+        break;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+      case XAI_F32:
+        xaiCast3DFromS64ToF32(inTile, outTile);
+        break;
+#endif
+      default:
+        return(XAI_ERR_DATATYPE);
+        break;
+    }
+#else  //#ifdef IVP_LAVN_4X64U_XP
+    if (!XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64))
+    {
+      xaiCast3DScalar_I64(inTile, outTile);
+    }
+    else
+    {
+      return(XAI_ERR_DATATYPE);
+    }
+#endif  //#ifdef IVP_LAVN_4X64U_XP
+  }
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+  else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_F16))
+  {
+    switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile))
+    {
+      case XAI_U8:
+        xaiCast3DFromF16ToU8(inTile, outTile);
+        break;
+      case XAI_S8:
+        xaiCast3DFromF16ToS8(inTile, outTile);
+        break;
+      case XAI_U16:
+        xaiCast3DFromF16ToU16(inTile, outTile);
+        break;
+      case XAI_S16:
+        xaiCast3DFromF16ToS16(inTile, outTile);
+        break;
+      case XAI_U32:
+        xaiCast3DFromF16ToU32(inTile, outTile);
+        break;
+      case XAI_S32:
+        xaiCast3DFromF16ToS32(inTile, outTile);
+        break;
+      case XAI_U64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromF16ToU64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+      case XAI_S64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromF16ToS64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+      case XAI_F32:
+        xaiCast3DFromF16ToF32(inTile, outTile);
+        break;
+#endif
+      default:
+        return(XAI_ERR_DATATYPE);
+        break;
+    }
+  }
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+  else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_F32))
+  {
+    switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile))
+    {
+      case XAI_U8:
+        xaiCast3DFromF32ToU8(inTile, outTile);
+        break;
+      case XAI_S8:
+        xaiCast3DFromF32ToS8(inTile, outTile);
+        break;
+      case XAI_U16:
+        xaiCast3DFromF32ToU16(inTile, outTile);
+        break;
+      case XAI_S16:
+        xaiCast3DFromF32ToS16(inTile, outTile);
+        break;
+      case XAI_U32:
+        xaiCast3DFromF32ToU32(inTile, outTile);
+        break;
+      case XAI_S32:
+        xaiCast3DFromF32ToS32(inTile, outTile);
+        break;
+      case XAI_U64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromF32ToU64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+      case XAI_S64:
+#ifdef IVP_LAVN_4X64U_XP
+        xaiCast3DFromF32ToS64(inTile, outTile);
+#else
+        xaiCast3DScalar_I64(inTile, outTile);
+#endif
+        break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+      case XAI_F16:
+        xaiCast3DFromF32ToF16(inTile, outTile);
+        break;
+#endif
+      default:
+        return(XAI_ERR_DATATYPE);
+        break;
+    }
+  }
+#endif
+  else
+  {
+    return(XAI_ERR_DATATYPE);
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/* ----------------------------------------------------------------------------------------------------------------------- */
+#else // Call the reference code only for MathX DSPs for now
+/* ----------------------------------------------------------------------------------------------------------------------- */
+#if ((XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+static float fp32_from_bits1(uint32_t w)  /* Returns the float whose stored bit pattern is w (no numeric conversion). */
+{
+  union  /* both members share storage, so writing one and reading the other reinterprets the bits */
+  {
+    uint32_t as_bits;
+    float    as_value;
+  } fp32 = { w };  /* a brace initializer sets the first member, as_bits */
+  return(fp32.as_value);
+}
+
+static uint32_t fp32_to_bits1(float f)  /* Returns the stored 32-bit pattern of f (inverse of fp32_from_bits1). */
+{
+  union  /* both members share storage, so writing one and reading the other reinterprets the bits */
+  {
+    float    as_value;
+    uint32_t as_bits;
+  } fp32 = { f };  /* a brace initializer sets the first member, as_value */
+  return(fp32.as_bits);
+}
+
+static float convert_fp16_to_fp32(uint16_t h)  /* Decodes a 16-bit half-precision bit pattern h into a float. */
+{
+  const uint32_t w                   = (uint32_t) h << 16;  /* half bits moved to the top 16 bits; sign now sits at bit 31 */
+  const uint32_t sign                = w & UINT32_C(0x80000000);
+  const uint32_t two_w               = w + w;  /* doubling shifts the sign bit out (wraps modulo 2^32) */
+  const uint32_t exp_offset          = UINT32_C(0xE0) << 23;  /* added to the exponent field of the normalized candidate */
+  const float exp_scale              = fp32_from_bits1(UINT32_C(0x7800000));  /* bit pattern of 2^-112 */
+  const float normalized_value       = fp32_from_bits1((two_w >> 4) + exp_offset) * exp_scale;  /* candidate for normal / Inf / NaN inputs */
+  const uint32_t magic_mask          = UINT32_C(126) << 23;  /* bit pattern of 0.5f */
+  const float magic_bias             = 0.5f;
+  const float denormalized_value     = fp32_from_bits1((two_w >> 17) | magic_mask) - magic_bias;  /* candidate for zero / subnormal inputs */
+  const uint32_t denormalized_cutoff = UINT32_C(1) << 27;  /* two_w below this means the half exponent field is zero */
+  const uint32_t result              = sign |
+                                       (two_w < denormalized_cutoff ? fp32_to_bits1(denormalized_value) : fp32_to_bits1(normalized_value));
+  return(fp32_from_bits1(result));  /* chosen magnitude with the original sign bit OR-ed back in */
+}
+
+static uint16_t convert_fp32_to_fp16(float f)  /* Encodes float f as a 16-bit half-precision bit pattern. */
+{
+  const float scale_to_inf  = fp32_from_bits1(UINT32_C(0x77800000));  /* bit pattern of 2^112 */
+  const float scale_to_zero = fp32_from_bits1(UINT32_C(0x08800000));  /* bit pattern of 2^-110 */
+  float base                = (fabsf(f) * scale_to_inf) * scale_to_zero;  /* order matters: the first product may overflow to +Inf */
+
+  const uint32_t w      = (uint32_t) fp32_to_bits1(f);
+  const uint32_t shl1_w = w + w;  /* bits of f with the sign shifted out */
+  const uint32_t sign   = w & UINT32_C(0x80000000);
+  uint32_t bias         = shl1_w & UINT32_C(0xFF000000);  /* exponent field of f, held in the top byte */
+  if (bias < UINT32_C(0x71000000))  /* clamp the exponent field from below */
+  {
+    bias = UINT32_C(0x71000000);
+  }
+
+  base = fp32_from_bits1((bias >> 1) + UINT32_C(0x07800000)) + base;  /* the float addition performs the rounding of the mantissa */
+  const uint32_t bits          = fp32_to_bits1(base);
+  const uint32_t exp_bits      = (bits >> 13) & UINT32_C(0x00007C00);  /* 5-bit half exponent field */
+  const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);  /* low 12 bits; added (not OR-ed) below so a carry can reach the exponent */
+  const uint32_t nonsign       = exp_bits + mantissa_bits;
+  return((sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign));  /* NaN input (exponent all ones, mantissa non-zero) yields 0x7E00 */
+}
+
+/**************************** xaiCast3D *****************************************/
+/* Description  : Reference (scalar) element-by-element data casting API        */
+/* Inputs       : inTile  (dimensions and data order must match outTile)        */
+/* Outputs      : XAI Error Code (XAI_ERR_NO_VARIANT: unsupported type pair)    */
+/* InOuts       : outTile (its element type selects the conversion)             */
+/********************************************************************************/
+XAI_ERR_TYPE xaiCast3D(const xai_pTile3D inTile, xai_pTile3D outTile)
+{
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_POINTER(inTile);
+    XAI_CHECK_POINTER(outTile);
+    XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nInput Data Order %d and Output Data Order %d are not same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile));
+  }
+
+  /* Get tile parameters */
+  const int32_t dim1Size  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size  = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inPitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inPitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Input data pointers: one typed view of the same buffer per supported element type */
+  uint8_t *pIn_8bU   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pIn_8b     = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  uint16_t *pIn_16bU = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int16_t *pIn_16b   = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  uint32_t *pIn_32bU = (uint32_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int32_t *pIn_32b   = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  uint64_t *pIn_64bU = (uint64_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int64_t *pIn_64b   = (int64_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)
+  xb_f16 *pIn_f16b = (xb_f16  *) XAI_TILE3D_GET_DATA_PTR(inTile);
+#endif
+  float *pIn_f32b = (float  *) XAI_TILE3D_GET_DATA_PTR(inTile);
+
+  /* Output data pointers: one typed view of the same buffer per supported element type */
+  uint8_t *pOut_8bU   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pOut_8b     = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  uint16_t *pOut_16bU = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int16_t *pOut_16b   = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  uint32_t *pOut_32bU = (uint32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t *pOut_32b   = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  uint64_t *pOut_64bU = (uint64_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int64_t *pOut_64b   = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)
+  xb_f16 *pOut_f16b = (xb_f16  *) XAI_TILE3D_GET_DATA_PTR(outTile);
+#endif
+  float *pOut_f32b = (float  *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  uint16_t temp; /* raw half-precision bits, moved with memcpy to respect strict aliasing */
+  int32_t x, y, z;
+
+  for (z = 0; z < dim3Size; z++) /* along 3rd dimension */
+  {
+    for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+    {
+      for (x = 0; x < dim1Size; x++) /* along 1st dimension */
+      {
+        // Conversions to U64
+        if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64))
+        {
+          switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile))
+          {
+            // U8 -> U64
+            case XAI_U8:
+              pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S8 -> U64
+            case XAI_S8:
+              pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_8b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U16 -> U64
+            case XAI_U16:
+              pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S16 -> U64
+            case XAI_S16:
+              pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_16b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U32 -> U64
+            case XAI_U32:
+              pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S32 -> U64
+            case XAI_S32:
+              pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> U64
+            case XAI_S64:
+              pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // F32 -> U64
+            case XAI_F32:
+              pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)
+            // F16 -> U64
+            case XAI_F16:
+              memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2);  // Strict Aliasing Rule, TENX-63685
+              pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) convert_fp16_to_fp32(temp);
+              break;
+#endif
+            default:
+              return(XAI_ERR_NO_VARIANT);
+              break;
+          }
+        }
+        // Conversions to S64
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64))
+        {
+          switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile))
+          {
+            // U8 -> S64
+            case XAI_U8:
+              pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S8 -> S64
+            case XAI_S8:
+              pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_8b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U16 -> S64
+            case XAI_U16:
+              pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S16 -> S64
+            case XAI_S16:
+              pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_16b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U32 -> S64
+            case XAI_U32:
+              pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S32 -> S64
+            case XAI_S32:
+              pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U64 -> S64
+            case XAI_U64:
+              pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // F32 -> S64
+            case XAI_F32:
+              pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)
+            // F16 -> S64
+            case XAI_F16:
+              memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2);
+              pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) convert_fp16_to_fp32(temp);
+              break;
+#endif
+            default:
+              return(XAI_ERR_NO_VARIANT);
+              break;
+          }
+        }
+        // Conversions to U32
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32))
+        {
+          switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile))
+          {
+            // U8 -> U32
+            case XAI_U8:
+              pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S8 -> U32
+            case XAI_S8:
+              pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_8b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U16 -> U32
+            case XAI_U16:
+              pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S16 -> U32
+            case XAI_S16:
+              pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_16b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S32 -> U32
+            case XAI_S32:
+              pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U64 -> U32
+            case XAI_U64:
+              pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> U32
+            case XAI_S64:
+              pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // F32 -> U32
+            case XAI_F32:
+              pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)
+            // F16 -> U32
+            case XAI_F16:
+              memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2);
+              pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) convert_fp16_to_fp32(temp);
+              break;
+#endif
+            default:
+              return(XAI_ERR_NO_VARIANT);
+              break;
+          }
+        }
+        // Conversions to S32
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32))
+        {
+          switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile))
+          {
+            // U8 -> S32
+            case XAI_U8:
+              pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S8 -> S32
+            case XAI_S8:
+              pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_8b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U16 -> S32
+            case XAI_U16:
+              pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S16 -> S32
+            case XAI_S16:
+              pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_16b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U32 -> S32
+            case XAI_U32:
+              pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U64 -> S32
+            case XAI_U64:
+              pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> S32
+            case XAI_S64:
+              pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // F32 -> S32
+            case XAI_F32:
+              pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)
+            // F16 -> S32
+            case XAI_F16:
+              memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2);
+              pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) convert_fp16_to_fp32(temp);
+              break;
+#endif
+            default:
+              return(XAI_ERR_NO_VARIANT);
+              break;
+          }
+        }
+        // Conversions to U16
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))
+        {
+          switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile))
+          {
+            // U8 -> U16
+            case XAI_U8:
+              pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S8 -> U16
+            case XAI_S8:
+              pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_8b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S16 -> U16
+            case XAI_S16:
+              pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_16b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U32 -> U16
+            case XAI_U32:
+              pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S32 -> U16
+            case XAI_S32:
+              pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U64 -> U16
+            case XAI_U64:
+              pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> U16
+            case XAI_S64:
+              pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // F32 -> U16
+            case XAI_F32:
+              pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)
+            // F16 -> U16
+            case XAI_F16:
+              memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2);
+              pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) convert_fp16_to_fp32(temp);
+              break;
+#endif
+            default:
+              return(XAI_ERR_NO_VARIANT);
+              break;
+          }
+        }
+        // Conversions to S16
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16))
+        {
+          switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile))
+          {
+            // U8 -> S16
+            case XAI_U8:
+              pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S8 -> S16
+            case XAI_S8:
+              pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_8b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U16 -> S16
+            case XAI_U16:
+              pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U32 -> S16
+            case XAI_U32:
+              pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S32 -> S16
+            case XAI_S32:
+              pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U64 -> S16
+            case XAI_U64:
+              pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> S16
+            case XAI_S64:
+              pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // F32 -> S16
+            case XAI_F32:
+              pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)
+            // F16 -> S16
+            case XAI_F16:
+              memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2);
+              pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) convert_fp16_to_fp32(temp);
+              break;
+#endif
+            default:
+              return(XAI_ERR_NO_VARIANT);
+              break;
+          }
+        }
+        // Conversions to U8
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8))
+        {
+          switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile))
+          {
+            // S8 -> U8
+            case XAI_S8:
+              pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_8b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U16 -> U8
+            case XAI_U16:
+              pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S16 -> U8
+            case XAI_S16:
+              pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_16b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U32 -> U8
+            case XAI_U32:
+              pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S32 -> U8
+            case XAI_S32:
+              pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U64 -> U8
+            case XAI_U64:
+              pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> U8
+            case XAI_S64:
+              pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // F32 -> U8
+            case XAI_F32:
+              pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)
+            // F16 -> U8
+            case XAI_F16:
+              memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2);
+              pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) convert_fp16_to_fp32(temp);
+              break;
+#endif
+            default:
+              return(XAI_ERR_NO_VARIANT);
+              break;
+          }
+        }
+        // Conversions to S8
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8))
+        {
+          switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile))
+          {
+            // U8 -> S8
+            case XAI_U8:
+              pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U16 -> S8
+            case XAI_U16:
+              pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S16 -> S8
+            case XAI_S16:
+              pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_16b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U32 -> S8
+            case XAI_U32:
+              pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S32 -> S8
+            case XAI_S32:
+              pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U64 -> S8
+            case XAI_U64:
+              pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> S8
+            case XAI_S64:
+              pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // F32 -> S8
+            case XAI_F32:
+              pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)
+            // F16 -> S8
+            case XAI_F16:
+              memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2);
+              pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) convert_fp16_to_fp32(temp);
+              break;
+#endif
+            default:
+              return(XAI_ERR_NO_VARIANT);
+              break;
+          }
+        }
+        // Conversions to F32
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_F32))
+        {
+          switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile))
+          {
+            // U8 -> F32
+            case XAI_U8:
+              pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_8bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S8 -> F32
+            case XAI_S8:
+              pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_8b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U16 -> F32
+            case XAI_U16:
+              pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_16bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S16 -> F32
+            case XAI_S16:
+              pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_16b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U32 -> F32
+            case XAI_U32:
+              pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_32bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S32 -> F32
+            case XAI_S32:
+              pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U64 -> F32
+            case XAI_U64:
+              pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> F32
+            case XAI_S64:
+              pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)
+            // F16 -> F32
+            case XAI_F16:
+              memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2);
+              pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) convert_fp16_to_fp32(temp);
+              break;
+#endif
+            default:
+              return(XAI_ERR_NO_VARIANT);
+              break;
+          }
+        }
+        // Conversions to F16
+#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_F16))
+        {
+          switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile))
+          {
+            // U8 -> F16
+            case XAI_U8:
+              temp = convert_fp32_to_fp16((float) pIn_8bU[z * inPitch2 + y * inPitch1 + x]);
+              memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2);             // Strict Aliasing Rule, TENX-63685
+              break;
+            // S8 -> F16
+            case XAI_S8:
+              temp = convert_fp32_to_fp16((float) pIn_8b[z * inPitch2 + y * inPitch1 + x]);
+              memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2);
+              break;
+            // U16 -> F16
+            case XAI_U16:
+              temp = convert_fp32_to_fp16((float) pIn_16bU[z * inPitch2 + y * inPitch1 + x]);
+              memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2);
+              break;
+            // S16 -> F16
+            case XAI_S16:
+              temp = convert_fp32_to_fp16((float) pIn_16b[z * inPitch2 + y * inPitch1 + x]);
+              memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2);
+              break;
+            // U32 -> F16
+            case XAI_U32:
+              temp = convert_fp32_to_fp16((float) pIn_32bU[z * inPitch2 + y * inPitch1 + x]);
+              memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2);
+              break;
+            // S32 -> F16
+            case XAI_S32:
+              temp = convert_fp32_to_fp16((float) pIn_32b[z * inPitch2 + y * inPitch1 + x]);
+              memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2);
+              break;
+            // U64 -> F16
+            case XAI_U64:
+              temp = convert_fp32_to_fp16((float) pIn_64bU[z * inPitch2 + y * inPitch1 + x]);
+              memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2);
+              break;
+            // S64 -> F16
+            case XAI_S64:
+              temp = convert_fp32_to_fp16((float) pIn_64b[z * inPitch2 + y * inPitch1 + x]);
+              memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2);
+              break;
+            // F32 -> F16
+            case XAI_F32:
+              temp = convert_fp32_to_fp16((float) pIn_f32b[z * inPitch2 + y * inPitch1 + x]);
+              memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2);
+              break;
+            default:
+              return(XAI_ERR_NO_VARIANT);
+              break;
+          }
+        }
+#endif
+        else { return(XAI_ERR_NO_VARIANT); } /* output type has no branch in this build: fail instead of silently writing nothing */
+      } /* end (x = 0; x < dim1Size; x++) loop */
+    }   /* end (y = 0; y < dim2Size; y++) loop */
+  }     /* end (z = 0; z < dim3Size; z++) loop */
+
+  return(XAI_ERROR_STATUS());
+}
+#endif // #if ((XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+/* ----------------------------------------------------------------------------------------------------------------------- */
+#endif // #if XCHAL_HAVE_VISION
+/* ----------------------------------------------------------------------------------------------------------------------- */
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_cast.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_cast.h
new file mode 100644
index 00000000000..22b20bdec10
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_cast.h
@@ -0,0 +1,1890 @@
+/*
+ * Copyright (c) 2025 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+#ifndef IVP_UNPKU2NX8U_L  /* fallback: defined here only when not already provided */
+#define IVP_UNPKU2NX8U_L(vecIn)  xb_vecNx16_rtor_xb_vecNx16U(IVP_MOVNX16_FROM2NX8U(IVP_SEL2NX8UI(0, vecIn, IVP_SELI_8B_INTERLEAVE_1_LO)))
+#endif /* IVP_UNPKU2NX8U_L: interleaves vecIn with 0 (INTERLEAVE_1_LO), result typed xb_vecNx16U */
+
+#ifndef IVP_UNPKU2NX8_L  /* fallback: defined here only when not already provided */
+#define IVP_UNPKU2NX8_L(vecIn)  IVP_MOVNX16_FROM2NX8U(IVP_SEL2NX8UI(0, vecIn, IVP_SELI_8B_INTERLEAVE_1_LO))
+#endif /* IVP_UNPKU2NX8_L: same interleave with 0, result left as the signed Nx16 type */
+
+#ifndef IVP_UNPKS2NX8_L  /* fallback: defined here only when not already provided */
+#define IVP_UNPKS2NX8_L(vecIn)  IVP_MOVNX16_FROM2NX8(IVP_SEL2NX8I(IVP_SRAI2NX8(vecIn, 7), vecIn, IVP_SELI_8B_INTERLEAVE_1_LO))
+#endif /* IVP_UNPKS2NX8_L: interleaves vecIn with (vecIn >> 7), i.e. with its sign fill; vecIn is expanded twice */
+
+#ifndef IVP_UNPKUNX16U_L  /* fallback: defined here only when not already provided */
+#define IVP_UNPKUNX16U_L(vecIn)  IVP_MOVN_2X32U_FROMNX16(IVP_SELNX16UI(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO))
+#endif /* IVP_UNPKUNX16U_L: 16-bit interleave with 0, result typed as unsigned N_2x32 */
+
+#ifndef IVP_UNPKSNX16_L  /* fallback: defined here only when not already provided */
+#define IVP_UNPKSNX16_L(vecIn)  IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(IVP_SRAINX16(vecIn, 15), vecIn, IVP_SELI_16B_INTERLEAVE_1_LO))
+#endif /* IVP_UNPKSNX16_L: 16-bit interleave with (vecIn >> 15); vecIn is expanded twice */
+
+#define UNPKSNX8_L(vecIn)   IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(IVP_SRAIN_2X32(IVP_UNPKSNX16_L(IVP_UNPKS2NX8_L(vecIn)), 31), IVP_UNPKSNX16_L(IVP_UNPKS2NX8_L(vecIn)), IVP_SELI_32B_INTERLEAVE_1_LO))  /* 8 -> 16 -> 32 signed unpack, then 32-bit interleave with (x >> 31); the unpack chain is expanded twice, so vecIn must be side-effect free */
+#define UNPKSNX16_L(vecIn)  IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(IVP_SRAIN_2X32(IVP_UNPKSNX16_L(vecIn), 31), IVP_UNPKSNX16_L(vecIn), IVP_SELI_32B_INTERLEAVE_1_LO))  /* 16 -> 32 signed unpack, then 32-bit interleave with (x >> 31); result viewed as Nx16 */
+#define UNPKSNX32_L(vecIn)  IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(IVP_SRAIN_2X32(vecIn, 31), vecIn, IVP_SELI_32B_INTERLEAVE_1_LO))  /* 32-bit interleave of vecIn with (vecIn >> 31); result viewed as Nx16 */
+
+#if IN_DATA_TYPE == UNSIGNED8BIT
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_IP_PRIME
+#undef MORPH_IP_VAR_LOAD
+
+#define MORPH_IDT_SCALAR   uint8_t  /* input element type selected for IN_DATA_TYPE == UNSIGNED8BIT */
+#define MORPH_IDT_VECTOR   xb_vec2Nx8U  /* vector type used for the loaded input */
+#define MORPH_IP_PRIME     IVP_LA2NX8U_PP  /* presumably primes the aligning load - confirm against the ISA reference */
+#define MORPH_IP_VAR_LOAD  IVP_LAV2NX8U_XP  /* presumably the variable-length aligning load - confirm */
+
+#if OUT_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU8ToS8  /* suffix for the uint8_t -> int8_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (2 * XCHAL_IVPN_SIMD_WIDTH)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          int8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vec2Nx8U_rtor_xb_vec2Nx8(vecIn);  /* vector type conversion only; lane width unchanged */
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU8ToU16  /* suffix for the uint8_t -> uint16_t variant */
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH
+#define MORPH_ODT_SCALAR          uint16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_UNPKU2NX8U_L(vecIn);  /* statement form, terminated like the other MORPH_IDT_CAST variants */
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16U_FP
+
+#elif OUT_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU8ToS16  /* suffix for the uint8_t -> int16_t variant */
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH
+#define MORPH_ODT_SCALAR          int16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_UNPKU2NX8_L(vecIn);  /* statement form, terminated like the other MORPH_IDT_CAST variants */
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU8ToU32  /* suffix for the uint8_t -> uint32_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          uint32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32Uv
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_UNPKUNX16U_L(IVP_UNPKU2NX8U_L(vecIn));  /* 8 -> 16 -> 32 zero-interleaved unpack; terminated like the other variants */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32U_FP
+
+#elif OUT_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU8ToS32  /* suffix for the uint8_t -> int32_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          int32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32v
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32Uv_rtor_xb_vecN_2x32v(IVP_UNPKUNX16U_L(IVP_UNPKU2NX8U_L(vecIn)));  /* explicit unsigned -> signed vector conversion, as in the U16 -> S32 variant */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU8ToU64  /* suffix for the uint8_t -> uint64_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 4)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          uint64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOVN_4X64U_FROMNX16(IVP_MOVNX16_FROM2NX8U(IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, vecIn, IVP_SELI_8B_INTERLEAVE_1_LO), IVP_SELI_8B_INTERLEAVE_1_LO), IVP_SELI_8B_INTERLEAVE_1_LO)));  /* three successive 8-bit interleaves with 0 */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64U_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == SIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU8ToS64  /* suffix for the uint8_t -> int64_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 4)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          int64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOVN_4X64_FROMNX16(IVP_MOVNX16_FROM2NX8U(IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, vecIn, IVP_SELI_8B_INTERLEAVE_1_LO), IVP_SELI_8B_INTERLEAVE_1_LO), IVP_SELI_8B_INTERLEAVE_1_LO)));  /* three successive 8-bit interleaves with 0 */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == FLOAT16BIT
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU8ToF16  /* suffix for the uint8_t -> xb_f16 variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          xb_f16
+#define MORPH_ODT_VECTOR          xb_vecNxf16
+#define MORPH_IDT_CAST(vecIn, vecOut)                                               \
+  xb_vecN_2x32Uv temp = IVP_UNPKUNX16U_L(IVP_UNPKU2NX8U_L(vecIn));                  \
+  xb_vecNxf16 temp1 = IVP_CVTF16N_2XF32_0(xb_vecN_2x32Uv_rtor_xb_vecN_2xf32(temp)); \
+  vecOut = IVP_SELNXF16I(0, temp1, IVP_SELI_16B_DEINTERLEAVE_1_EVEN);  /* NOTE: declares temp and temp1 in the caller's scope */
+#define MORPH_OP_VAR_STORE        IVP_SAVNXF16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNXF16_FP
+#endif
+
+#elif OUT_DATA_TYPE == FLOAT32BIT
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU8ToF32  /* suffix for the uint8_t -> float variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          float
+#define MORPH_ODT_VECTOR          xb_vecN_2xf32
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32Uv_rtor_xb_vecN_2xf32(IVP_UNPKUNX16U_L(IVP_UNPKU2NX8U_L(vecIn)));  /* widen to 32-bit lanes, then convert the vector to xb_vecN_2xf32 */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2XF32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2XF32_FP
+#endif
+#endif
+
+
+#elif IN_DATA_TYPE == SIGNED8BIT
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_IP_PRIME
+#undef MORPH_IP_VAR_LOAD
+
+#define MORPH_IDT_SCALAR   int8_t  /* input element type selected for IN_DATA_TYPE == SIGNED8BIT */
+#define MORPH_IDT_VECTOR   xb_vec2Nx8  /* vector type used for the loaded input */
+#define MORPH_IP_PRIME     IVP_LA2NX8_PP  /* presumably primes the aligning load - confirm against the ISA reference */
+#define MORPH_IP_VAR_LOAD  IVP_LAV2NX8_XP  /* presumably the variable-length aligning load - confirm */
+
+#if OUT_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS8ToU8  /* suffix for the int8_t -> uint8_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (2 * XCHAL_IVPN_SIMD_WIDTH)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          uint8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vec2Nx8_rtor_xb_vec2Nx8U(vecIn);  /* vector type conversion only; lane width unchanged */
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8U_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS8ToU16  /* suffix for the int8_t -> uint16_t variant */
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH  /* presumably elements processed per vector step - confirm at the use site */
+#define MORPH_ODT_SCALAR          uint16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecNx16_rtor_xb_vecNx16U(IVP_UNPKS2NX8_L(vecIn));  /* signed unpack to 16-bit lanes, then conversion to the unsigned vector type */
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16U_FP
+
+#elif OUT_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS8ToS16  /* suffix for the int8_t -> int16_t variant */
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH  /* presumably elements processed per vector step - confirm at the use site */
+#define MORPH_ODT_SCALAR          int16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_UNPKS2NX8_L(vecIn);  /* signed unpack: bytes interleaved with their sign fill */
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS8ToU32  /* suffix for the int8_t -> uint32_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          uint32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32Uv
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_UNPKSNX16_L((IVP_UNPKS2NX8_L(vecIn))));  /* 8 -> 16 -> 32 signed unpack, then conversion to the unsigned vector type */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32U_FP
+
+#elif OUT_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS8ToS32  /* suffix for the int8_t -> int32_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          int32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32v
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_UNPKSNX16_L((IVP_UNPKS2NX8_L(vecIn)));  /* 8 -> 16 -> 32 signed unpack */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS8ToU64  /* suffix for the int8_t -> uint64_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 4)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          uint64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOVN_4X64U_FROMNX16(UNPKSNX8_L(vecIn));  /* UNPKSNX8_L expands vecIn several times; pass a plain variable */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64U_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == SIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS8ToS64  /* suffix for the int8_t -> int64_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 4)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          int64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOVN_4X64_FROMNX16(UNPKSNX8_L(vecIn));  /* UNPKSNX8_L expands vecIn several times; pass a plain variable */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == FLOAT16BIT
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS8ToF16  /* suffix for the int8_t -> xb_f16 variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          xb_f16
+#define MORPH_ODT_VECTOR          xb_vecNxf16
+#define MORPH_IDT_CAST(vecIn, vecOut)                                              \
+  xb_vecN_2x32v temp = IVP_UNPKSNX16_L(IVP_UNPKS2NX8_L(vecIn));                    \
+  xb_vecNxf16 temp1 = IVP_CVTF16N_2XF32_0(xb_vecN_2x32v_rtor_xb_vecN_2xf32(temp)); \
+  vecOut = IVP_SELNXF16I(0, temp1, IVP_SELI_16B_DEINTERLEAVE_1_EVEN);  /* NOTE: declares temp and temp1 in the caller's scope */
+#define MORPH_OP_VAR_STORE        IVP_SAVNXF16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNXF16_FP
+#endif
+
+#elif OUT_DATA_TYPE == FLOAT32BIT
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS8ToF32  /* suffix for the int8_t -> float variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          float
+#define MORPH_ODT_VECTOR          xb_vecN_2xf32
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32v_rtor_xb_vecN_2xf32(IVP_UNPKSNX16_L(IVP_UNPKS2NX8_L(vecIn)));  /* signed unpack to 32-bit lanes, then convert the vector to xb_vecN_2xf32 */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2XF32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2XF32_FP
+#endif
+#endif
+
+
+#elif IN_DATA_TYPE == UNSIGNED16BIT
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_IP_PRIME
+#undef MORPH_IP_VAR_LOAD
+
+#define MORPH_IDT_SCALAR   uint16_t  /* input element type selected for IN_DATA_TYPE == UNSIGNED16BIT */
+#define MORPH_IDT_VECTOR   xb_vecNx16U  /* vector type used for the loaded input */
+#define MORPH_IP_PRIME     IVP_LANX16U_PP  /* presumably primes the aligning load - confirm against the ISA reference */
+#define MORPH_IP_VAR_LOAD  IVP_LAVNX16U_XP  /* presumably the variable-length aligning load - confirm */
+
+#if OUT_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU16ToU8  /* suffix for the uint16_t -> uint8_t variant */
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH  /* presumably elements processed per vector step - confirm at the use site */
+#define MORPH_ODT_SCALAR          uint8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOV2NX8U_FROMNX16(IVP_SELNX16UI(0, vecIn, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0));  /* byte extract (1 of 2, offset 0); presumably keeps the low byte of each lane - confirm */
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8U_FP
+
+#elif OUT_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU16ToS8  /* suffix for the uint16_t -> int8_t variant */
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH  /* presumably elements processed per vector step - confirm at the use site */
+#define MORPH_ODT_SCALAR          int8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOV2NX8_FROMNX16(IVP_SELNX16UI(0, vecIn, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0));  /* byte extract (1 of 2, offset 0); presumably keeps the low byte of each lane - confirm */
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8_FP
+
+#elif OUT_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU16ToS16  /* suffix for the uint16_t -> int16_t variant */
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH  /* presumably elements processed per vector step - confirm at the use site */
+#define MORPH_ODT_SCALAR          int16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecNx16U_rtor_xb_vecNx16(vecIn);  /* vector type conversion only; lane width unchanged */
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU16ToU32  /* suffix for the uint16_t -> uint32_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          uint32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32Uv
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_UNPKUNX16U_L(vecIn);  /* 16 -> 32 unpack, interleaving with 0 */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32U_FP
+
+#elif OUT_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU16ToS32  /* suffix for the uint16_t -> int32_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          int32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32v
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32Uv_rtor_xb_vecN_2x32v(IVP_UNPKUNX16U_L(vecIn));  /* 16 -> 32 zero-interleaved unpack, then conversion to the signed vector type */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU16ToU64  /* suffix for the uint16_t -> uint64_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 4)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          uint64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOVN_4X64U_FROMNX16(IVP_SELNX16UI(0, IVP_SELNX16UI(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO), IVP_SELI_16B_INTERLEAVE_1_LO));  /* two successive 16-bit interleaves with 0 */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64U_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == SIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU16ToS64  /* suffix for the uint16_t -> int64_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 4)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          int64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOVN_4X64_FROMNX16(IVP_SELNX16UI(0, IVP_SELNX16UI(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO), IVP_SELI_16B_INTERLEAVE_1_LO));  /* two successive 16-bit interleaves with 0 */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == FLOAT16BIT
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU16ToF16  /* suffix for the uint16_t -> xb_f16 variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          xb_f16
+#define MORPH_ODT_VECTOR          xb_vecNxf16
+#define MORPH_IDT_CAST(vecIn, vecOut)                                                                 \
+  xb_vecNxf16 temp = IVP_CVTF16N_2XF32_0(xb_vecN_2x32Uv_rtor_xb_vecN_2xf32(IVP_UNPKUNX16U_L(vecIn))); \
+  vecOut           = IVP_SELNXF16I(0, temp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN);  /* NOTE: declares temp in the caller's scope */
+#define MORPH_OP_VAR_STORE        IVP_SAVNXF16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNXF16_FP
+#endif
+
+#elif OUT_DATA_TYPE == FLOAT32BIT
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU16ToF32  /* suffix for the uint16_t -> float variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          float
+#define MORPH_ODT_VECTOR          xb_vecN_2xf32
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32Uv_rtor_xb_vecN_2xf32(IVP_UNPKUNX16U_L(vecIn));  /* widen to 32-bit lanes, then convert the vector to xb_vecN_2xf32 */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2XF32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2XF32_FP
+#endif
+#endif
+
+
+#elif IN_DATA_TYPE == SIGNED16BIT
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_IP_PRIME
+#undef MORPH_IP_VAR_LOAD
+
+#define MORPH_IDT_SCALAR   int16_t  /* input element type selected for IN_DATA_TYPE == SIGNED16BIT */
+#define MORPH_IDT_VECTOR   xb_vecNx16  /* vector type used for the loaded input */
+#define MORPH_IP_PRIME     IVP_LANX16_PP  /* presumably primes the aligning load - confirm against the ISA reference */
+#define MORPH_IP_VAR_LOAD  IVP_LAVNX16_XP  /* presumably the variable-length aligning load - confirm */
+
+#if OUT_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS16ToU8  /* suffix for the int16_t -> uint8_t variant */
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH  /* presumably elements processed per vector step - confirm at the use site */
+#define MORPH_ODT_SCALAR          uint8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOV2NX8U_FROMNX16(IVP_SELNX16I(0, vecIn, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0));  /* byte extract (1 of 2, offset 0); presumably keeps the low byte of each lane - confirm */
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8U_FP
+
+#elif OUT_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS16ToS8  /* suffix for the int16_t -> int8_t variant */
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH  /* presumably elements processed per vector step - confirm at the use site */
+#define MORPH_ODT_SCALAR          int8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOV2NX8_FROMNX16(IVP_SELNX16I(0, vecIn, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0));  /* byte extract (1 of 2, offset 0); presumably keeps the low byte of each lane - confirm */
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS16ToU16  /* suffix for the int16_t -> uint16_t variant */
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH  /* presumably elements processed per vector step - confirm at the use site */
+#define MORPH_ODT_SCALAR          uint16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecNx16_rtor_xb_vecNx16U(vecIn);  /* vector type conversion only; lane width unchanged */
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16U_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS16ToU32  /* suffix for the int16_t -> uint32_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          uint32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32Uv
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_UNPKSNX16_L(vecIn));  /* 16 -> 32 signed unpack, then conversion to the unsigned vector type */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32U_FP
+
+#elif OUT_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS16ToS32  /* suffix for the int16_t -> int32_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          int32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32v
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_UNPKSNX16_L(vecIn);  /* 16 -> 32 signed unpack */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS16ToU64  /* suffix for the int16_t -> uint64_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 4)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          uint64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOVN_4X64U_FROMNX16(UNPKSNX16_L(vecIn));  /* UNPKSNX16_L expands vecIn several times; pass a plain variable */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64U_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == SIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS16ToS64  /* suffix for the int16_t -> int64_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 4)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          int64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOVN_4X64_FROMNX16(UNPKSNX16_L(vecIn));  /* UNPKSNX16_L expands vecIn several times; pass a plain variable */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == FLOAT16BIT
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS16ToF16  /* suffix for the int16_t -> xb_f16 variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          xb_f16
+#define MORPH_ODT_VECTOR          xb_vecNxf16
+#define MORPH_IDT_CAST(vecIn, vecOut)                                                               \
+  xb_vecNxf16 temp = IVP_CVTF16N_2XF32_0(xb_vecN_2x32v_rtor_xb_vecN_2xf32(IVP_UNPKSNX16_L(vecIn))); \
+  vecOut           = IVP_SELNXF16I(0, temp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN);  /* NOTE: declares temp in the caller's scope */
+#define MORPH_OP_VAR_STORE        IVP_SAVNXF16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNXF16_FP
+#endif
+
+#elif OUT_DATA_TYPE == FLOAT32BIT
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromS16ToF32  /* suffix for the int16_t -> float variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          float
+#define MORPH_ODT_VECTOR          xb_vecN_2xf32
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32v_rtor_xb_vecN_2xf32(IVP_UNPKSNX16_L(vecIn));  /* signed unpack to 32-bit lanes, then convert the vector to xb_vecN_2xf32 */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2XF32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2XF32_FP
+#endif
+#endif
+
+
+#elif IN_DATA_TYPE == UNSIGNED32BIT
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_IP_PRIME
+#undef MORPH_IP_VAR_LOAD
+
+#define MORPH_IDT_SCALAR   uint32_t  /* input element type selected for IN_DATA_TYPE == UNSIGNED32BIT */
+#define MORPH_IDT_VECTOR   xb_vecN_2x32Uv  /* vector type used for the loaded input */
+#define MORPH_IP_PRIME     IVP_LAN_2X32U_PP  /* presumably primes the aligning load - confirm against the ISA reference */
+#define MORPH_IP_VAR_LOAD  IVP_LAVN_2X32U_XP  /* presumably the variable-length aligning load - confirm */
+
+#if OUT_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU32ToU8  /* suffix for the uint32_t -> uint8_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          uint8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32U(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);  /* two successive 1-of-2 byte extracts */
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8U_FP
+
+#elif OUT_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU32ToS8  /* suffix for the uint32_t -> int8_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          int8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32U(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);  /* two successive 1-of-2 byte extracts */
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU32ToU16  /* suffix for the uint32_t -> uint16_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          uint16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELNX16UI(0, IVP_MOVNX16_FROMN_2X32U(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN);  /* keeps the even 16-bit lanes of the 32-bit input */
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16U_FP
+
+#elif OUT_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU32ToS16  /* suffix for the uint32_t -> int16_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          int16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32U(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN);  /* keeps the even 16-bit lanes of the 32-bit input */
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16_FP
+
+#elif OUT_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU32ToS32  /* suffix for the uint32_t -> int32_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          int32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32v
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32Uv_rtor_xb_vecN_2x32v(vecIn);  /* vector type conversion only; lane width unchanged */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromU32ToU64  /* suffix for the uint32_t -> uint64_t variant */
+#define MORPH_VECTORIZATIONWIDTH  (XCHAL_IVPN_SIMD_WIDTH / 4)  /* parenthesized so it expands as one operand */
+#define MORPH_ODT_SCALAR          uint64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOVN_4X64U_FROMNX16(IVP_MOVNX16_FROMN_2X32U(IVP_SELN_2X32UI(0, vecIn, IVP_SELI_32B_INTERLEAVE_1_LO)));  /* 32-bit interleave with 0 */
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64U_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == SIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// U32 -> int64_t (suffix FromU32ToS64), only when IVP_LAVN_4X64U_XP exists. Cast interleaves the input with 0 (INTERLEAVE_1_LO) via the signed select and views it as 64-bit lanes; presumably zero-extension - confirm.
+#define MAKE_NAME(name)  name ## FromU32ToS64
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          int64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOVN_4X64_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(0, vecIn, IVP_SELI_32B_INTERLEAVE_1_LO)));
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == FLOAT16BIT
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// U32 -> xb_f16 (suffix FromU32ToF16), only with a half-precision VFPU. Cast: rtor to f32, IVP_CVTF16N_2XF32_0, then keep the DEINTERLEAVE_1_EVEN 16-bit lanes. The body is braced so its temporary cannot leak into or clash with the caller's scope.
+#define MAKE_NAME(name)  name ## FromU32ToF16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          xb_f16
+#define MORPH_ODT_VECTOR          xb_vecNxf16
+#define MORPH_IDT_CAST(vecIn, vecOut)  { \
+  xb_vecNxf16 morphCastTmp = IVP_CVTF16N_2XF32_0(xb_vecN_2x32Uv_rtor_xb_vecN_2xf32(vecIn)); \
+  vecOut = IVP_SELNXF16I(0, morphCastTmp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); }
+#define MORPH_OP_VAR_STORE        IVP_SAVNXF16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNXF16_FP
+#endif
+
+#elif OUT_DATA_TYPE == FLOAT32BIT
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// U32 -> float (suffix FromU32ToF32), only with a single-precision VFPU. Cast is the xb_vecN_2x32Uv -> xb_vecN_2xf32 rtor conversion.
+#define MAKE_NAME(name)  name ## FromU32ToF32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          float
+#define MORPH_ODT_VECTOR          xb_vecN_2xf32
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32Uv_rtor_xb_vecN_2xf32(vecIn);
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2XF32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2XF32_FP
+#endif // single-precision VFPU check
+#endif // end of the OUT_DATA_TYPE chain for this input type
+
+
+#elif IN_DATA_TYPE == SIGNED32BIT
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_IP_PRIME
+#undef MORPH_IP_VAR_LOAD
+// Input side for int32_t sources: scalar and vector types plus the intrinsics bound to MORPH_IP_PRIME and MORPH_IP_VAR_LOAD. The output side is selected by the OUT_DATA_TYPE chain that follows.
+#define MORPH_IDT_SCALAR   int32_t
+#define MORPH_IDT_VECTOR   xb_vecN_2x32v
+#define MORPH_IP_PRIME     IVP_LAN_2X32_PP
+#define MORPH_IP_VAR_LOAD  IVP_LAVN_2X32_XP
+
+#if OUT_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int32_t -> uint8_t (suffix FromS32ToU8): the cast views the lanes as bytes and applies the EXTRACT_1_OF_2_OFF_0 select twice; presumably keeps the low byte of each lane (truncation, no saturation) - confirm.
+#define MAKE_NAME(name)  name ## FromS32ToU8
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          uint8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8U_FP
+
+#elif OUT_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int32_t -> int8_t (suffix FromS32ToS8): the cast views the lanes as bytes and applies the EXTRACT_1_OF_2_OFF_0 select twice; presumably keeps the low byte of each lane (truncation, no saturation) - confirm.
+#define MAKE_NAME(name)  name ## FromS32ToS8
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          int8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int32_t -> uint16_t (suffix FromS32ToU16): the cast views the input as 16-bit lanes and keeps the DEINTERLEAVE_1_EVEN elements; presumably the low halves (truncation) - confirm.
+#define MAKE_NAME(name)  name ## FromS32ToU16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          uint16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELNX16UI(0, IVP_MOVNX16_FROMN_2X32(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16U_FP
+
+#elif OUT_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int32_t -> int16_t (suffix FromS32ToS16): the cast views the input as 16-bit lanes and keeps the DEINTERLEAVE_1_EVEN elements; presumably the low halves (truncation) - confirm.
+#define MAKE_NAME(name)  name ## FromS32ToS16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          int16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int32_t -> uint32_t (suffix FromS32ToU32): same lane width, so the cast is only the xb_vecN_2x32v -> xb_vecN_2x32Uv rtor conversion.
+#define MAKE_NAME(name)  name ## FromS32ToU32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          uint32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32Uv
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(vecIn);
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32U_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int32_t -> uint64_t (suffix FromS32ToU64), only when IVP_LAVN_4X64U_XP exists. The widening goes through UNPKSNX32_L, which is defined outside this section (presumably a sign-extending unpack - confirm).
+#define MAKE_NAME(name)  name ## FromS32ToU64
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          uint64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOVN_4X64U_FROMNX16(UNPKSNX32_L(vecIn));
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64U_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == SIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int32_t -> int64_t (suffix FromS32ToS64), only when IVP_LAVN_4X64U_XP exists. The widening goes through UNPKSNX32_L, which is defined outside this section (presumably a sign-extending unpack - confirm).
+#define MAKE_NAME(name)  name ## FromS32ToS64
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          int64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_MOVN_4X64_FROMNX16(UNPKSNX32_L(vecIn));
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == FLOAT16BIT
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int32_t -> xb_f16 (suffix FromS32ToF16), only with a half-precision VFPU. Cast: rtor to f32, IVP_CVTF16N_2XF32_0, then keep the DEINTERLEAVE_1_EVEN 16-bit lanes. The body is braced so its temporary cannot leak into or clash with the caller's scope.
+#define MAKE_NAME(name)  name ## FromS32ToF16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          xb_f16
+#define MORPH_ODT_VECTOR          xb_vecNxf16
+#define MORPH_IDT_CAST(vecIn, vecOut)  { \
+  xb_vecNxf16 morphCastTmp = IVP_CVTF16N_2XF32_0(xb_vecN_2x32v_rtor_xb_vecN_2xf32(vecIn)); \
+  vecOut = IVP_SELNXF16I(0, morphCastTmp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); }
+#define MORPH_OP_VAR_STORE        IVP_SAVNXF16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNXF16_FP
+#endif
+
+#elif OUT_DATA_TYPE == FLOAT32BIT
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int32_t -> float (suffix FromS32ToF32), only with a single-precision VFPU. Cast is the xb_vecN_2x32v -> xb_vecN_2xf32 rtor conversion.
+#define MAKE_NAME(name)  name ## FromS32ToF32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          float
+#define MORPH_ODT_VECTOR          xb_vecN_2xf32
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32v_rtor_xb_vecN_2xf32(vecIn);
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2XF32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2XF32_FP
+#endif // single-precision VFPU check
+#endif // OUT_DATA_TYPE chain for SIGNED32BIT input
+
+
+#elif IN_DATA_TYPE == UNSIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_IP_PRIME
+#undef MORPH_IP_VAR_LOAD
+// Input side for uint64_t sources; this whole section is skipped (no macro is redefined) when IVP_LAVN_4X64U_XP is not defined.
+#define MORPH_IDT_SCALAR   uint64_t
+#define MORPH_IDT_VECTOR   xb_vecN_4x64U
+#define MORPH_IP_PRIME     IVP_LAN_4X64U_PP
+#define MORPH_IP_VAR_LOAD  IVP_LAVN_4X64U_XP
+
+#if OUT_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// uint64_t -> uint8_t (suffix FromU64ToU8): the cast views the lanes as bytes and applies the EXTRACT_1_OF_2_OFF_0 select three times; presumably keeps the low byte of each lane (truncation) - confirm.
+#define MAKE_NAME(name)  name ## FromU64ToU8
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          uint8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_4X64U(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8U_FP
+
+#elif OUT_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// uint64_t -> int8_t (suffix FromU64ToS8): the cast views the lanes as bytes and applies the EXTRACT_1_OF_2_OFF_0 select three times; presumably keeps the low byte of each lane (truncation) - confirm.
+#define MAKE_NAME(name)  name ## FromU64ToS8
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          int8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_4X64U(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// uint64_t -> uint16_t (suffix FromU64ToU16): the cast views the input as 16-bit lanes and applies the DEINTERLEAVE_1_EVEN select twice; presumably keeps the low 16 bits of each lane - confirm.
+#define MAKE_NAME(name)  name ## FromU64ToU16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          uint16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELNX16UI(0, IVP_SELNX16UI(0, IVP_MOVNX16_FROMN_4X64U(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN), IVP_SELI_16B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16U_FP
+
+#elif OUT_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// uint64_t -> int16_t (suffix FromU64ToS16): the cast views the input as 16-bit lanes and applies the DEINTERLEAVE_1_EVEN select twice; presumably keeps the low 16 bits of each lane - confirm.
+#define MAKE_NAME(name)  name ## FromU64ToS16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          int16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELNX16I(0, IVP_SELNX16I(0, IVP_MOVNX16_FROMN_4X64U(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN), IVP_SELI_16B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// uint64_t -> uint32_t (suffix FromU64ToU32): the cast views the input as 32-bit lanes and keeps the DEINTERLEAVE_1_EVEN elements; presumably the low 32 bits of each lane - confirm.
+#define MAKE_NAME(name)  name ## FromU64ToU32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          uint32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32Uv
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELN_2X32UI(0, IVP_MOVN_2X32U_FROMNX16(IVP_MOVNX16_FROMN_4X64U(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32U_FP
+
+#elif OUT_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// uint64_t -> int32_t (suffix FromU64ToS32): the cast views the input as 32-bit lanes and keeps the DEINTERLEAVE_1_EVEN elements; presumably the low 32 bits of each lane - confirm.
+#define MAKE_NAME(name)  name ## FromU64ToS32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          int32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32v
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELN_2X32I(0, IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROMN_4X64U(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32_FP
+
+#elif OUT_DATA_TYPE == SIGNED64BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// uint64_t -> int64_t (suffix FromU64ToS64): same lane width, so the cast is only the xb_vecN_4x64U -> xb_vecN_4x64 rtor conversion.
+#define MAKE_NAME(name)  name ## FromU64ToS64
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          int64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_4x64U_rtor_xb_vecN_4x64(vecIn);
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64_FP
+
+#elif OUT_DATA_TYPE == FLOAT16BIT
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// uint64_t -> xb_f16 (suffix FromU64ToF16), only with a half-precision VFPU. Cast keeps the DEINTERLEAVE_1_EVEN 32-bit lanes (signed select), converts with IVP_CVTF16N_2XF32_0, then keeps the even 16-bit lanes. Braced so the temporary stays local. NOTE(review): inputs >= 2^31 presumably convert as negative - confirm.
+#define MAKE_NAME(name)  name ## FromU64ToF16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          xb_f16
+#define MORPH_ODT_VECTOR          xb_vecNxf16
+#define MORPH_IDT_CAST(vecIn, vecOut)  { \
+  xb_vecNxf16 morphCastTmp = IVP_CVTF16N_2XF32_0(IVP_SELN_2X32I(0, IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROMN_4X64U(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN)); \
+  vecOut = IVP_SELNXF16I(0, morphCastTmp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); }
+#define MORPH_OP_VAR_STORE        IVP_SAVNXF16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNXF16_FP
+#endif
+
+#elif OUT_DATA_TYPE == FLOAT32BIT
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// uint64_t -> float (suffix FromU64ToF32), only with a single-precision VFPU. Cast keeps the DEINTERLEAVE_1_EVEN 32-bit lanes via the signed select, then applies the xb_vecN_2x32v -> xb_vecN_2xf32 rtor conversion. NOTE(review): inputs >= 2^31 presumably convert as negative - confirm.
+#define MAKE_NAME(name)  name ## FromU64ToF32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          float
+#define MORPH_ODT_VECTOR          xb_vecN_2xf32
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32v_rtor_xb_vecN_2xf32(IVP_SELN_2X32I(0, IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROMN_4X64U(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN));
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2XF32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2XF32_FP
+#endif // single-precision VFPU check
+#endif // OUT_DATA_TYPE chain for UNSIGNED64BIT input
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+
+#elif IN_DATA_TYPE == SIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_IP_PRIME
+#undef MORPH_IP_VAR_LOAD
+// Input side for int64_t sources; this whole section is skipped (no macro is redefined) when IVP_LAVN_4X64U_XP is not defined.
+#define MORPH_IDT_SCALAR   int64_t
+#define MORPH_IDT_VECTOR   xb_vecN_4x64
+#define MORPH_IP_PRIME     IVP_LAN_4X64_PP
+#define MORPH_IP_VAR_LOAD  IVP_LAVN_4X64_XP
+
+#if OUT_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int64_t -> uint8_t (suffix FromS64ToU8): the cast views the lanes as bytes and applies the EXTRACT_1_OF_2_OFF_0 select three times; presumably keeps the low byte of each lane (truncation) - confirm.
+#define MAKE_NAME(name)  name ## FromS64ToU8
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          uint8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_4X64(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8U_FP
+
+#elif OUT_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int64_t -> int8_t (suffix FromS64ToS8): the cast views the lanes as bytes and applies the EXTRACT_1_OF_2_OFF_0 select three times; presumably keeps the low byte of each lane (truncation) - confirm.
+#define MAKE_NAME(name)  name ## FromS64ToS8
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          int8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_4X64(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int64_t -> uint16_t (suffix FromS64ToU16): the cast views the input as 16-bit lanes and applies the DEINTERLEAVE_1_EVEN select twice; presumably keeps the low 16 bits of each lane - confirm.
+#define MAKE_NAME(name)  name ## FromS64ToU16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          uint16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELNX16UI(0, IVP_SELNX16UI(0, IVP_MOVNX16_FROMN_4X64(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN), IVP_SELI_16B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16U_FP
+
+#elif OUT_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int64_t -> int16_t (suffix FromS64ToS16): the cast views the input as 16-bit lanes and applies the DEINTERLEAVE_1_EVEN select twice; presumably keeps the low 16 bits of each lane - confirm.
+#define MAKE_NAME(name)  name ## FromS64ToS16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          int16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELNX16I(0, IVP_SELNX16I(0, IVP_MOVNX16_FROMN_4X64(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN), IVP_SELI_16B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int64_t -> uint32_t (suffix FromS64ToU32): the cast views the input as 32-bit lanes and keeps the DEINTERLEAVE_1_EVEN elements; presumably the low 32 bits of each lane - confirm.
+#define MAKE_NAME(name)  name ## FromS64ToU32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          uint32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32Uv
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELN_2X32UI(0, IVP_MOVN_2X32U_FROMNX16(IVP_MOVNX16_FROMN_4X64(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32U_FP
+
+#elif OUT_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int64_t -> int32_t (suffix FromS64ToS32): the cast views the input as 32-bit lanes and keeps the DEINTERLEAVE_1_EVEN elements; presumably the low 32 bits of each lane - confirm.
+#define MAKE_NAME(name)  name ## FromS64ToS32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          int32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32v
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELN_2X32I(0, IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROMN_4X64(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED64BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int64_t -> uint64_t (suffix FromS64ToU64): same lane width, so the cast is only the xb_vecN_4x64 -> xb_vecN_4x64U rtor conversion.
+#define MAKE_NAME(name)  name ## FromS64ToU64
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          uint64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_4x64_rtor_xb_vecN_4x64U(vecIn);
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64U_FP
+
+#elif OUT_DATA_TYPE == FLOAT16BIT
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int64_t -> xb_f16 (suffix FromS64ToF16), only with a half-precision VFPU. Cast keeps the DEINTERLEAVE_1_EVEN 32-bit lanes, converts with IVP_CVTF16N_2XF32_0, then keeps the even 16-bit lanes. The body is braced so its temporary cannot leak into or clash with the caller's scope.
+#define MAKE_NAME(name)  name ## FromS64ToF16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          xb_f16
+#define MORPH_ODT_VECTOR          xb_vecNxf16
+#define MORPH_IDT_CAST(vecIn, vecOut)  { \
+  xb_vecNxf16 morphCastTmp = IVP_CVTF16N_2XF32_0(IVP_SELN_2X32I(0, IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROMN_4X64(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN)); \
+  vecOut = IVP_SELNXF16I(0, morphCastTmp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); }
+#define MORPH_OP_VAR_STORE        IVP_SAVNXF16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNXF16_FP
+#endif
+
+#elif OUT_DATA_TYPE == FLOAT32BIT
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// int64_t -> float (suffix FromS64ToF32), only with a single-precision VFPU. Cast keeps the DEINTERLEAVE_1_EVEN 32-bit lanes, then applies the xb_vecN_2x32v -> xb_vecN_2xf32 rtor conversion; presumably only the low 32 bits of each input contribute - confirm.
+#define MAKE_NAME(name)  name ## FromS64ToF32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          float
+#define MORPH_ODT_VECTOR          xb_vecN_2xf32
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2x32v_rtor_xb_vecN_2xf32(IVP_SELN_2X32I(0, IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROMN_4X64(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN));
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2XF32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2XF32_FP
+#endif // single-precision VFPU check
+#endif // OUT_DATA_TYPE chain for SIGNED64BIT input
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+
+#elif IN_DATA_TYPE == FLOAT16BIT
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_IP_PRIME
+#undef MORPH_IP_VAR_LOAD
+// Input side for xb_f16 sources; this whole section is skipped (no macro is redefined) unless one of the half-precision VFPU options equals 1.
+#define MORPH_IDT_SCALAR   xb_f16
+#define MORPH_IDT_VECTOR   xb_vecNxf16
+#define MORPH_IP_PRIME     IVP_LANXF16_PP
+#define MORPH_IP_VAR_LOAD  IVP_LAVNXF16_XP
+
+#if OUT_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// xb_f16 -> uint8_t (suffix FromF16ToU8). Cast: interleave with 0 (INTERLEAVE_1_LO), IVP_CVTF32NXF16_0, rtor to 32-bit integer, then the EXTRACT_1_OF_2_OFF_0 byte select twice. The body is braced so its temporary cannot leak into or clash with the caller's scope.
+#define MAKE_NAME(name)  name ## FromF16ToU8
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          uint8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8U
+#define MORPH_IDT_CAST(vecIn, vecOut)  { \
+  xb_vecN_2xf32 morphCastTmp = IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)); \
+  vecOut = IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(morphCastTmp))), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); }
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8U_FP
+
+#elif OUT_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// xb_f16 -> int8_t (suffix FromF16ToS8). Cast: interleave with 0 (INTERLEAVE_1_LO), IVP_CVTF32NXF16_0, rtor to 32-bit integer, then the EXTRACT_1_OF_2_OFF_0 byte select twice. The body is braced so its temporary cannot leak into or clash with the caller's scope.
+#define MAKE_NAME(name)  name ## FromF16ToS8
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          int8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8
+#define MORPH_IDT_CAST(vecIn, vecOut)  { \
+  xb_vecN_2xf32 morphCastTmp = IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)); \
+  vecOut = IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(morphCastTmp))), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); }
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// xb_f16 -> uint16_t (suffix FromF16ToU16). Cast: interleave with 0 (INTERLEAVE_1_LO), IVP_CVTF32NXF16_0, rtor to 32-bit integer, then keep the DEINTERLEAVE_1_EVEN 16-bit lanes. The body is braced so its temporary cannot leak into or clash with the caller's scope.
+#define MAKE_NAME(name)  name ## FromF16ToU16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          uint16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16U
+#define MORPH_IDT_CAST(vecIn, vecOut)  { \
+  xb_vecN_2xf32 morphCastTmp = IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)); \
+  vecOut = IVP_SELNX16UI(0, IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(morphCastTmp)), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); }
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16U_FP
+
+#elif OUT_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// xb_f16 -> int16_t (suffix FromF16ToS16). Cast: interleave with 0 (INTERLEAVE_1_LO), IVP_CVTF32NXF16_0, rtor to 32-bit integer, then keep the DEINTERLEAVE_1_EVEN 16-bit lanes. The body is braced so its temporary cannot leak into or clash with the caller's scope.
+#define MAKE_NAME(name)  name ## FromF16ToS16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          int16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16
+#define MORPH_IDT_CAST(vecIn, vecOut)  { \
+  xb_vecN_2xf32 morphCastTmp = IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)); \
+  vecOut = IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(morphCastTmp)), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); }
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// xb_f16 -> uint32_t (suffix FromF16ToU32). Cast: interleave with 0 (INTERLEAVE_1_LO), IVP_CVTF32NXF16_0, then the xb_vecN_2xf32 -> xb_vecN_2x32Uv rtor conversion.
+#define MAKE_NAME(name)  name ## FromF16ToU32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          uint32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32Uv
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2xf32_rtor_xb_vecN_2x32Uv(IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)));
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32U_FP
+
+#elif OUT_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// xb_f16 -> int32_t (suffix FromF16ToS32). Cast: interleave with 0 (INTERLEAVE_1_LO), IVP_CVTF32NXF16_0, then the xb_vecN_2xf32 -> xb_vecN_2x32v rtor conversion.
+#define MAKE_NAME(name)  name ## FromF16ToS32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          int32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32v
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2xf32_rtor_xb_vecN_2x32v(IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)));
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// xb_f16 -> uint64_t (suffix FromF16ToU64), only when IVP_LAVN_4X64U_XP exists. Cast converts to 32-bit integers, builds a per-lane sign word (arithmetic shift by 31) and interleaves value and sign words into 64-bit lanes. Braced with unique local names so a caller variable named vecInp or sign cannot be captured.
+#define MAKE_NAME(name)  name ## FromF16ToU64
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          uint64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64U
+#define MORPH_IDT_CAST(vecIn, vecOut)  { \
+  xb_vecN_2x32v morphCastIn32 = xb_vecN_2xf32_rtor_xb_vecN_2x32v(IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO))); \
+  xb_vecN_2x32v morphCastSign = IVP_SRAIN_2X32(morphCastIn32, 31); \
+  vecOut = IVP_MOVN_4X64U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32UI(morphCastSign, morphCastIn32, IVP_SELI_32B_INTERLEAVE_1_LO))); }
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64U_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == SIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// xb_f16 -> int64_t (suffix FromF16ToS64), only when IVP_LAVN_4X64U_XP exists. Cast converts to 32-bit integers, builds a per-lane sign word (arithmetic shift by 31) and interleaves value and sign words into 64-bit lanes. Braced with unique local names so a caller variable named vecInp or sign cannot be captured.
+#define MAKE_NAME(name)  name ## FromF16ToS64
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          int64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64
+#define MORPH_IDT_CAST(vecIn, vecOut)  { \
+  xb_vecN_2x32v morphCastIn32 = xb_vecN_2xf32_rtor_xb_vecN_2x32v(IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO))); \
+  xb_vecN_2x32v morphCastSign = IVP_SRAIN_2X32(morphCastIn32, 31); \
+  vecOut = IVP_MOVN_4X64_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32UI(morphCastSign, morphCastIn32, IVP_SELI_32B_INTERLEAVE_1_LO))); }
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == FLOAT32BIT
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// xb_f16 -> float (suffix FromF16ToF32), only with a single-precision VFPU as well. Cast: interleave with 0 (INTERLEAVE_1_LO), then IVP_CVTF32NXF16_0.
+#define MAKE_NAME(name)  name ## FromF16ToF32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          float
+#define MORPH_ODT_VECTOR          xb_vecN_2xf32
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO));
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2XF32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2XF32_FP
+#endif // single-precision VFPU check
+#endif // OUT_DATA_TYPE chain for FLOAT16BIT input
+#endif // half-precision VFPU check for FLOAT16BIT input
+
+
+#elif IN_DATA_TYPE == FLOAT32BIT
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_IP_PRIME
+#undef MORPH_IP_VAR_LOAD
+// Input side for float sources; this whole section is skipped (no macro is redefined) unless one of the single-precision VFPU options equals 1.
+#define MORPH_IDT_SCALAR   float
+#define MORPH_IDT_VECTOR   xb_vecN_2xf32
+#define MORPH_IP_PRIME     IVP_LAN_2XF32_PP
+#define MORPH_IP_VAR_LOAD  IVP_LAVN_2XF32_XP
+
+#if OUT_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+// float -> uint8_t (suffix FromF32ToU8). Cast: xb_vecN_2xf32 -> xb_vecN_2x32v rtor conversion, view as bytes, then the EXTRACT_1_OF_2_OFF_0 select twice; presumably keeps the low byte of each converted lane (no saturation) - confirm.
+#define MAKE_NAME(name)  name ## FromF32ToU8
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          uint8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(vecIn))), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8U_FP
+
+#elif OUT_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromF32ToS8
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          int8_t
+#define MORPH_ODT_VECTOR          xb_vec2Nx8
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(vecIn))), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+#define MORPH_OP_VAR_STORE        IVP_SAV2NX8_XP
+#define MORPH_OP_FLUSH            IVP_SAPOS2NX8_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromF32ToU16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          uint16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16U
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELNX16UI(0, IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(vecIn)), IVP_SELI_16B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16U_FP
+
+#elif OUT_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromF32ToS16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          int16_t
+#define MORPH_ODT_VECTOR          xb_vecNx16
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(vecIn)), IVP_SELI_16B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVNX16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNX16_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromF32ToU32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          uint32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32Uv
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2xf32_rtor_xb_vecN_2x32Uv(vecIn);
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32U_FP
+
+#elif OUT_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromF32ToS32
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          int32_t
+#define MORPH_ODT_VECTOR          xb_vecN_2x32v
+#define MORPH_IDT_CAST(vecIn, vecOut)  vecOut = xb_vecN_2xf32_rtor_xb_vecN_2x32v(vecIn);
+#define MORPH_OP_VAR_STORE        IVP_SAVN_2X32_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_2X32_FP
+
+#elif OUT_DATA_TYPE == UNSIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromF32ToU64
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          uint64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64U
+#define MORPH_IDT_CAST(vecIn, vecOut)                              \
+  xb_vecN_2x32v vecInp = xb_vecN_2xf32_rtor_xb_vecN_2x32Uv(vecIn); \
+  xb_vecN_2x32v sign = IVP_SRAIN_2X32(vecInp, 31);                 \
+  vecOut = IVP_MOVN_4X64U_FROMNX16(IVP_MOVNX16_FROMN_2X32U(IVP_SELN_2X32UI(sign, vecInp, IVP_SELI_32B_INTERLEAVE_1_LO)));
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64U_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64U_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == SIGNED64BIT
+#ifdef IVP_LAVN_4X64U_XP
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromF32ToS64
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 4
+#define MORPH_ODT_SCALAR          int64_t
+#define MORPH_ODT_VECTOR          xb_vecN_4x64
+#define MORPH_IDT_CAST(vecIn, vecOut)                             \
+  xb_vecN_2x32v vecInp = xb_vecN_2xf32_rtor_xb_vecN_2x32v(vecIn); \
+  xb_vecN_2x32v sign = IVP_SRAIN_2X32(vecInp, 31);                \
+  vecOut = IVP_MOVN_4X64_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32UI(sign, vecInp, IVP_SELI_32B_INTERLEAVE_1_LO)));
+#define MORPH_OP_VAR_STORE        IVP_SAVN_4X64_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSN_4X64_FP
+#endif //#ifdef IVP_LAVN_4X64U_XP
+
+#elif OUT_DATA_TYPE == FLOAT16BIT
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1))
+#undef MAKE_NAME
+#undef MORPH_VECTORIZATIONWIDTH
+#undef MORPH_ODT_SCALAR
+#undef MORPH_ODT_VECTOR
+#undef MORPH_IDT_CAST
+#undef MORPH_OP_VAR_STORE
+#undef MORPH_OP_FLUSH
+
+#define MAKE_NAME(name)  name ## FromF32ToF16
+#define MORPH_VECTORIZATIONWIDTH  XCHAL_IVPN_SIMD_WIDTH / 2
+#define MORPH_ODT_SCALAR          xb_f16
+#define MORPH_ODT_VECTOR          xb_vecNxf16
+#define MORPH_IDT_CAST(vecIn, vecOut)            \
+  xb_vecNxf16 temp = IVP_CVTF16N_2XF32_0(vecIn); \
+  vecOut           = IVP_SELNXF16I(0, temp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN);
+#define MORPH_OP_VAR_STORE        IVP_SAVNXF16_XP
+#define MORPH_OP_FLUSH            IVP_SAPOSNXF16_FP
+#endif
+#endif
+#endif
+#endif
+
+/**************************** xaiCast3D *****************************************/
+/* Description  :  Data casting implementation for input and output data Type   */
+/*               of S8, U8, S16, U16, S32, U32, U64, S64, F16 and F32           */
+/* Inputs       : inTile                                                        */
+/* Outputs      : void                                                          */
+/* InOuts       : outTile                                                       */
+/********************************************************************************/
+
+void MAKE_NAME (xaiCast3D)(const xai_pTile3D inTile, xai_pTile3D outTile)
+{
+  /* Element-wise cast of inTile into outTile: sizes are read from inTile, pitches from each tile. */
+  const int32_t dim1Size  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size  = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inPitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inPitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  /* Scalar/vector types and the prime, load, cast, store and flush steps all come from the MORPH_* macros chosen by IN_DATA_TYPE/OUT_DATA_TYPE. */
+  MORPH_IDT_SCALAR* pInput  = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  MORPH_ODT_SCALAR* pOutput = (MORPH_ODT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_VECTOR *__restrict pdvecIn;
+  MORPH_ODT_VECTOR *__restrict pdvecOut;
+
+  valign vaOutData = IVP_ZALIGN();
+  MORPH_IDT_VECTOR vecInData;
+  MORPH_ODT_VECTOR vecOutData;
+  /* Fast path: both tiles have a DIM2 pitch equal to dim1Size * dim2Size, so the whole tile is walked as one linear run. */
+  if ((inPitch2 == (dim1Size * dim2Size)) && (outPitch2 == inPitch2))
+  {
+    int dimsCount = dim1Size * dim2Size * dim3Size;
+    int x;
+    pdvecIn  = (MORPH_IDT_VECTOR *) pInput;
+    pdvecOut = (MORPH_ODT_VECTOR *) pOutput;
+    valign vaInData = MORPH_IP_PRIME(pdvecIn);
+    /* remLen caps the last iteration; the load/store macros are given byte counts, hence the sizeof() scaling. */
+    for (x = 0; x < dimsCount; x += MORPH_VECTORIZATIONWIDTH)
+    {
+      int remLen = MIN2(dimsCount - x, MORPH_VECTORIZATIONWIDTH); /* width macro is an unparenthesized division; assumes MIN2 parenthesizes its arguments - TODO confirm */
+      MORPH_IP_VAR_LOAD(vecInData, vaInData, pdvecIn, sizeof(MORPH_IDT_SCALAR) * remLen);
+      MORPH_IDT_CAST(vecInData, vecOutData); /* some variants expand to several statements declaring locals (vecInp, sign, temp): keep it alone in this brace scope */
+      MORPH_OP_VAR_STORE(vecOutData, vaOutData, pdvecOut, sizeof(MORPH_ODT_SCALAR) * remLen);
+    }
+    MORPH_OP_FLUSH(vaOutData, pdvecOut); /* issued once after the whole run; presumably drains what vaOutData still holds - TODO confirm against the ISA reference */
+  }
+  else
+  {
+    int x, y, z;
+    for (z = 0; z < dim3Size; z++)
+    {
+      for (y = 0; y < dim2Size; y++)
+      {
+        MORPH_IDT_SCALAR* pIn  = pInput + z * inPitch2 + y * inPitch1;
+        MORPH_ODT_SCALAR* pOut = pOutput + z * outPitch2 + y * outPitch1;
+        /* Strided path: each dim1 row is located through its own tile's pitches and vaInData is re-initialised with MORPH_IP_PRIME per row. */
+        pdvecIn  = (MORPH_IDT_VECTOR *) pIn;
+        pdvecOut = (MORPH_ODT_VECTOR *) pOut;
+        valign vaInData = MORPH_IP_PRIME(pdvecIn);
+
+        for (x = 0; x < dim1Size; x += MORPH_VECTORIZATIONWIDTH)
+        {
+          int remLen = MIN2(dim1Size - x, MORPH_VECTORIZATIONWIDTH);
+          MORPH_IP_VAR_LOAD(vecInData, vaInData, pdvecIn, sizeof(MORPH_IDT_SCALAR) * remLen);
+          MORPH_IDT_CAST(vecInData, vecOutData);
+          MORPH_OP_VAR_STORE(vecOutData, vaOutData, pdvecOut, sizeof(MORPH_ODT_SCALAR) * remLen);
+        }
+        MORPH_OP_FLUSH(vaOutData, pdvecOut); /* here the flush is issued once per row, before pdvecOut is re-pointed at the next row */
+      } // end of for(y = 0; y < dim2Size; y++)
+    }   // end of for(z = 0; z < dim3Size; z++)
+  }     // end of   if((inPitch2 == (dim1Size * dim2Size)) && (outPitch2 == inPitch2))
+
+  return;
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_cast_scalar.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_cast_scalar.h
new file mode 100644
index 00000000000..39ed7230bc8
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_cast_scalar.h
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2025 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+/**************************** xaiCast3DScalar_I64 *******************************/
+/* Description  :  Data casting scalar implementation for the case when         */
+/*                 input or output data Type are U64 or S64                     */
+/* Inputs       : inTile                                                        */
+/* Outputs      : void                                                          */
+/* InOuts       : outTile                                                       */
+/********************************************************************************/
+
+void xaiCast3DScalar_I64(const xai_pTile3D inTile,
+                         xai_pTile3D outTile)
+{
+  /* Get Tile Parameters */
+  const int32_t dim1Size  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size  = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size  = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inPitch1  = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inPitch2  = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers: typed aliases of the same input and output buffers; the switches below choose which pair is dereferenced */
+  uint8_t *pIn_8bU   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pIn_8b     = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  uint16_t *pIn_16bU = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int16_t *pIn_16b   = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  uint32_t *pIn_32bU = (uint32_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int32_t *pIn_32b   = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  uint64_t *pIn_64bU = (uint64_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int64_t *pIn_64b   = (int64_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1))
+  xb_f16 *pIn_f16b = (xb_f16 *) XAI_TILE3D_GET_DATA_PTR(inTile);
+#endif
+  float *pIn_f32b = (float *) XAI_TILE3D_GET_DATA_PTR(inTile);
+
+  uint8_t *pout_8bU   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pout_8b     = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  uint16_t *pout_16bU = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int16_t *pout_16b   = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  uint32_t *pout_32bU = (uint32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t *pout_32b   = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  uint64_t *pout_64bU = (uint64_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int64_t *pout_64b   = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1))
+  xb_f16 *pout_f16b = (xb_f16 *) XAI_TILE3D_GET_DATA_PTR(outTile);
+#endif
+  float *pout_f32b = (float *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  int32_t x, y, z;
+  /* The (output type, input type) pair is re-tested for every element; 64-bit outputs accept the listed inputs, every other output accepts only U64/S64 inputs. */
+  for (z = 0; z < dim3Size; z++) /* along 3rd dimension */
+  {
+    for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+    {
+      for (x = 0; x < dim1Size; x++) /* along 1st dimension */
+      {
+        // Conversions to U64
+        if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U8 -> U64
+            case XAI_U8:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S8 -> U64
+            case XAI_S8:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_8b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U16 -> U64
+            case XAI_U16:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S16 -> U64
+            case XAI_S16:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_16b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U32 -> U64
+            case XAI_U32:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S32 -> U64
+            case XAI_S32:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> U64
+            case XAI_S64:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // F32 -> U64
+            case XAI_F32:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1))
+            // F16 -> U64
+            case XAI_F16:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) IVP_CVTF32F16(pIn_f16b[z * inPitch2 + y * inPitch1 + x]);
+              break;
+#endif
+            default: /* unhandled input type (there is no U64 -> U64 case): the output element is left unwritten */
+              break;
+          }
+        }
+        // Conversions to S64
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U8 -> S64
+            case XAI_U8:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S8 -> S64
+            case XAI_S8:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_8b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U16 -> S64
+            case XAI_U16:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S16 -> S64
+            case XAI_S16:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_16b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U32 -> S64
+            case XAI_U32:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S32 -> S64
+            case XAI_S32:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U64 -> S64
+            case XAI_U64:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // F32 -> S64
+            case XAI_F32:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1))
+            // F16 -> S64
+            case XAI_F16:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) IVP_CVTF32F16(pIn_f16b[z * inPitch2 + y * inPitch1 + x]);
+              break;
+#endif
+            default: /* unhandled input type (there is no S64 -> S64 case): the output element is left unwritten */
+              break;
+          }
+        }
+        // Conversions to U32
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> U32
+            case XAI_U64:
+              pout_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> U32
+            case XAI_S64:
+              pout_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to S32
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> S32
+            case XAI_U64:
+              pout_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> S32
+            case XAI_S64:
+              pout_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to U16
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> U16
+            case XAI_U64:
+              pout_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> U16
+            case XAI_S64:
+              pout_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to S16
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> S16
+            case XAI_U64:
+              pout_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> S16
+            case XAI_S64:
+              pout_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to U8
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> U8
+            case XAI_U64:
+              pout_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> U8
+            case XAI_S64:
+              pout_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to S8
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> S8
+            case XAI_U64:
+              pout_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> S8
+            case XAI_S64:
+              pout_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to F32
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_F32))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> F32
+            case XAI_U64:
+              pout_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> F32
+            case XAI_S64:
+              pout_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to F16
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1))
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_F16))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> F16
+            case XAI_U64:
+              pout_f16b[z * outPitch2 + y * outPitch1 + x] = IVP_CVTF16F32((float) pIn_64bU[z * inPitch2 + y * inPitch1 + x]);
+              break;
+            // S64 -> F16
+            case XAI_S64:
+              pout_f16b[z * outPitch2 + y * outPitch1 + x] = IVP_CVTF16F32((float) pIn_64b[z * inPitch2 + y * inPitch1 + x]);
+              break;
+            default:
+              break;
+          }
+        }
+#endif
+      } /* end (x = 0; x < dim1Size; x++) loop */
+    }   /* end (y = 0; y < dim2Size; y++) loop */
+  }     /* end (z = 0; z < dim3Size; z++) loop */
+  return;
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.c
new file mode 100644
index 00000000000..9c540ab97eb
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+
+#define ELTADD_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_add.h" /* each inclusion emits one typed variant named through MAKE_NAME (here xaiEltwiseAdd3D_S8_AV) */
+#undef ELTADD_DATA_TYPE
+
+#define ELTADD_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_add.h"
+#undef ELTADD_DATA_TYPE
+
+#define ELTADD_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_add.h"
+#undef ELTADD_DATA_TYPE
+
+#define ELTADD_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_add.h"
+#undef ELTADD_DATA_TYPE
+
+#define ELTADD_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_add.h"
+#undef ELTADD_DATA_TYPE
+
+#define ELTADD_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_add.h"
+#undef ELTADD_DATA_TYPE
+/* The floating-point variants are instantiated only when the corresponding XCHAL_HAVE_VISION_*_VFPU option equals 1. */
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#define ELTADD_DATA_TYPE  FLOAT16BIT
+#include "cnn_eltwise_add.h"
+#undef ELTADD_DATA_TYPE
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#define ELTADD_DATA_TYPE  FLOAT32BIT
+#include "cnn_eltwise_add.h"
+#undef ELTADD_DATA_TYPE
+#endif
+
+/**************************** xaiEltwiseAdd3D_AV *****************************************/
+/* Description  : General API for auto-vectorizable Broadcast element-wise addition      */
+/*                Calls the xaiEltwiseAdd3D_<type>_AV variant matching inTile1's type    */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XAI error code; XAI_ERR_BADARG if inTile1's type has no variant        */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseAdd3D_AV(const xai_pTile3D inTile1,
+                                const xai_pTile3D inTile2,
+                                xai_pTile3D outTile)
+{
+  if (!inTile1 || !inTile2 || !outTile) /* reject NULL tiles before any type query dereferences them */
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8)) /* only inTile1 selects the variant; each variant checks the other tiles itself */
+  {
+    return(xaiEltwiseAdd3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseAdd3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseAdd3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseAdd3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseAdd3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseAdd3D_U32_AV(inTile1, inTile2, outTile));
+  }
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16))
+  {
+    return(xaiEltwiseAdd3D_F16_AV(inTile1, inTile2, outTile));
+  }
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32))
+  {
+    return(xaiEltwiseAdd3D_F32_AV(inTile1, inTile2, outTile));
+  }
+#endif
+  /* No variant is compiled in for inTile1's element type: report it rather than returning XAI_ERR_OK with outTile never written. */
+  return(XAI_ERR_BADARG);
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.h
new file mode 100644
index 00000000000..5157aa80360
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#define FLOAT16BIT     7
+#define FLOAT32BIT     8
+#endif
+
+#if ELTADD_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELTADD_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELTADD_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELTADD_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELTADD_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELTADD_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+
+#elif ELTADD_DATA_TYPE == FLOAT16BIT
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F16
+#define MORPH_IDT_SCALAR  xb_f16
+#endif
+
+#elif ELTADD_DATA_TYPE == FLOAT32BIT
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F32
+#define MORPH_IDT_SCALAR  float
+#endif
+#endif
+
+#define ADD2(a, b)  a + b
+
+
+/**************************** xaiEltwiseAdd3D ********************************************/
+/* Description  : auto-vectorizable implementation of Broadcast element-wise addition    */
+/*               Based on MORPH implementation eight variants are generated for          */
+/*               S8, U8, S16, U16, S32, U32, F16 and F32 data types                      */
+/*               A flat loop is used only for densely packed tiles without broadcast;    */
+/*               every other layout is processed row by row using the tile pitches       */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code                                                          */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseAdd3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Output extents and pitches (the pitches scale the typed element pointers below) */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const int32_t planeSizeOut  = dim1SizeOut * dim2SizeOut;
+
+  /* Base data pointers of the three tiles */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Declare the per-dimension input pitches; a pitch of 0 marks a broadcast dimension */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* Flat fast path. A single linear sweep matches the pitch-based addressing below    */
+  /* only when, for every output dimension larger than 1, no input is broadcast and    */
+  /* all three tiles are densely packed (row pitch == dim1, plane pitch == dim1*dim2). */
+  /* Equal dim-3 pitches alone do not guarantee that, e.g. for padded tiles.           */
+  if ((dim1SizeOut == 1 || (inTile1Pitch0 != 0 && inTile2Pitch0 != 0)) &&
+      (dim2SizeOut == 1 || (inTile1Pitch1 == dim1SizeOut && inTile2Pitch1 == dim1SizeOut && outTilePitch1 == dim1SizeOut)) &&
+      (dim3SizeOut == 1 || (inTile1Pitch2 == planeSizeOut && inTile2Pitch2 == planeSizeOut && outTilePitch2 == planeSizeOut)))
+  {
+    int dimsCount = planeSizeOut * dim3SizeOut;
+    pIn1 = pInput1;
+    pIn2 = pInput2;
+    pOut = pOutput;
+    for (int i = 0; i < dimsCount; i++)
+    {
+      pOut[i] = ADD2(pIn1[i], pIn2[i]);
+    }
+  }
+  else
+  {
+    /* General path: step through planes (z) and rows (y) with each tile's own pitches. */
+    /* inTileNPitch0/1/2 == 0 means tile N is broadcast along dimension 1/2/3, so the   */
+    /* same element, row or plane is read again for every output position.              */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        if (inTile1Pitch0 == 0)
+        {
+          /* Tile1 is broadcast along dim 1: load its single element once, outside the loop */
+          InData1 = pIn1[0];
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2   = pIn2[idx];
+            pOut[idx] = ADD2(InData1, InData2);
+          }
+        }
+        else if (inTile2Pitch0 == 0)
+        {
+          /* Tile2 is broadcast along dim 1: load its single element once, outside the loop */
+          InData2 = pIn2[0];
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            pOut[idx] = ADD2(InData1, InData2);
+          }
+        }
+        else
+        {
+          /* Neither input is broadcast along dim 1; dim 2/3 broadcast is already folded into pIn1/pIn2 */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            InData2   = pIn2[idx];
+            pOut[idx] = ADD2(InData1, InData2);
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.c
new file mode 100644
index 00000000000..ee6c1fe3a3c
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+/* Each define/include/undef triple below expands cnn_eltwise_and.h once, producing one typed xaiEltwiseAnd3D_*_AV function */
+#define ELTAND_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_and.h"
+#undef ELTAND_DATA_TYPE
+
+#define ELTAND_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_and.h"
+#undef ELTAND_DATA_TYPE
+
+#define ELTAND_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_and.h"
+#undef ELTAND_DATA_TYPE
+
+#define ELTAND_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_and.h"
+#undef ELTAND_DATA_TYPE
+
+#define ELTAND_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_and.h"
+#undef ELTAND_DATA_TYPE
+
+#define ELTAND_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_and.h"
+#undef ELTAND_DATA_TYPE
+
+
+/**************************** xaiEltwiseAnd3D_AV *****************************************/
+/* Description  : General API for the auto-vectorizable Broadcast element-wise bitwise   */
+/*                AND operator; calls the xaiEltwiseAnd3D_<type>_AV variant for inTile1  */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code; XAI_ERR_NULLARG for a NULL tile, XAI_ERR_BADARG when    */
+/*                the type of inTile1 has no variant (outTile is then left unwritten)    */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseAnd3D_AV(const xai_pTile3D inTile1,
+                                const xai_pTile3D inTile2,
+                                xai_pTile3D outTile)
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseAnd3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseAnd3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseAnd3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseAnd3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseAnd3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseAnd3D_U32_AV(inTile1, inTile2, outTile));
+  }
+
+  return(XAI_ERR_BADARG);  /* nothing was dispatched; NOTE(review): use a dedicated data-type error code if the library has one */
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.h
new file mode 100644
index 00000000000..de51db0c785
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#endif
+/* Select the name suffix (MAKE_NAME), tile type check (MORPH_IDT_CHECK) and element type (MORPH_IDT_SCALAR) for ELTAND_DATA_TYPE */
+#if ELTAND_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELTAND_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELTAND_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELTAND_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELTAND_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELTAND_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+#endif
+
+#define AND2(a, b)  ((a) & (b))  /* parenthesized: '&' binds looser than '==' and arithmetic, so a bare expansion can regroup */
+
+
+/**************************** xaiEltwiseAnd3D *********************************************/
+/* Description  : auto-vectorizable implementation of the Broadcast element-wise         */
+/*               bitwise AND operator. Based on MORPH implementation six variants are    */
+/*               generated for S8, U8, S16, U16, S32, and U32 data types                 */
+/*               A flat loop is used only for densely packed tiles without broadcast;    */
+/*               every other layout is processed row by row using the tile pitches       */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code                                                          */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseAnd3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Output extents and pitches (the pitches scale the typed element pointers below) */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const int32_t planeSizeOut  = dim1SizeOut * dim2SizeOut;
+
+  /* Base data pointers of the three tiles */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Declare the per-dimension input pitches; a pitch of 0 marks a broadcast dimension */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* Flat fast path. A single linear sweep matches the pitch-based addressing below    */
+  /* only when, for every output dimension larger than 1, no input is broadcast and    */
+  /* all three tiles are densely packed (row pitch == dim1, plane pitch == dim1*dim2). */
+  /* Equal dim-3 pitches alone do not guarantee that, e.g. for padded tiles.           */
+  if ((dim1SizeOut == 1 || (inTile1Pitch0 != 0 && inTile2Pitch0 != 0)) &&
+      (dim2SizeOut == 1 || (inTile1Pitch1 == dim1SizeOut && inTile2Pitch1 == dim1SizeOut && outTilePitch1 == dim1SizeOut)) &&
+      (dim3SizeOut == 1 || (inTile1Pitch2 == planeSizeOut && inTile2Pitch2 == planeSizeOut && outTilePitch2 == planeSizeOut)))
+  {
+    int dimsCount = planeSizeOut * dim3SizeOut;
+    pIn1 = pInput1;
+    pIn2 = pInput2;
+    pOut = pOutput;
+    for (int i = 0; i < dimsCount; i++)
+    {
+      pOut[i] = AND2(pIn1[i], pIn2[i]);
+    }
+  }
+  else
+  {
+    /* General path: step through planes (z) and rows (y) with each tile's own pitches. */
+    /* inTileNPitch0/1/2 == 0 means tile N is broadcast along dimension 1/2/3, so the   */
+    /* same element, row or plane is read again for every output position.              */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        if (inTile1Pitch0 == 0)
+        {
+          /* Tile1 is broadcast along dim 1: load its single element once, outside the loop */
+          InData1 = pIn1[0];
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2   = pIn2[idx];
+            pOut[idx] = AND2(InData1, InData2);
+          }
+        }
+        else if (inTile2Pitch0 == 0)
+        {
+          /* Tile2 is broadcast along dim 1: load its single element once, outside the loop */
+          InData2 = pIn2[0];
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            pOut[idx] = AND2(InData1, InData2);
+          }
+        }
+        else
+        {
+          /* Neither input is broadcast along dim 1; dim 2/3 broadcast is already folded into pIn1/pIn2 */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            InData2   = pIn2[idx];
+            pOut[idx] = AND2(InData1, InData2);
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.c
new file mode 100644
index 00000000000..a8f18f62f1a
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+/* Each define/include/undef triple expands cnn_eltwise_equal.h once per element type; F16/F32 only when the matching VFPU option is 1 */
+#define ELTEQUAL_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_equal.h"
+#undef ELTEQUAL_DATA_TYPE
+
+#define ELTEQUAL_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_equal.h"
+#undef ELTEQUAL_DATA_TYPE
+
+#define ELTEQUAL_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_equal.h"
+#undef ELTEQUAL_DATA_TYPE
+
+#define ELTEQUAL_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_equal.h"
+#undef ELTEQUAL_DATA_TYPE
+
+#define ELTEQUAL_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_equal.h"
+#undef ELTEQUAL_DATA_TYPE
+
+#define ELTEQUAL_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_equal.h"
+#undef ELTEQUAL_DATA_TYPE
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#define ELTEQUAL_DATA_TYPE  FLOAT16BIT
+#include "cnn_eltwise_equal.h"
+#undef ELTEQUAL_DATA_TYPE
+#endif
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#define ELTEQUAL_DATA_TYPE  FLOAT32BIT
+#include "cnn_eltwise_equal.h"
+#undef ELTEQUAL_DATA_TYPE
+#endif
+
+
+/**************************** xaiEltwiseEqual3D_AV ***************************************/
+/* Description  : General API for the auto-vectorizable Broadcast element-wise EQUAL     */
+/*                operator; calls the xaiEltwiseEqual3D_<type>_AV variant for inTile1    */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code; XAI_ERR_NULLARG for a NULL tile, XAI_ERR_BADARG when    */
+/*                the type of inTile1 has no variant (outTile is then left unwritten)    */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseEqual3D_AV(const xai_pTile3D inTile1,
+                                  const xai_pTile3D inTile2,
+                                  xai_pTile3D outTile)
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseEqual3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseEqual3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseEqual3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseEqual3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseEqual3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseEqual3D_U32_AV(inTile1, inTile2, outTile));
+  }
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16))
+  {
+    return(xaiEltwiseEqual3D_F16_AV(inTile1, inTile2, outTile));
+  }
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32))
+  {
+    return(xaiEltwiseEqual3D_F32_AV(inTile1, inTile2, outTile));
+  }
+#endif
+
+  return(XAI_ERR_BADARG);  /* nothing was dispatched; NOTE(review): use a dedicated data-type error code if the library has one */
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.h
new file mode 100644
index 00000000000..f6f7efbdc85
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.h
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#define FLOAT16BIT     7
+#define FLOAT32BIT     8
+#endif
+/* Select the name suffix (MAKE_NAME), tile type check (MORPH_IDT_CHECK) and element type (MORPH_IDT_SCALAR) for ELTEQUAL_DATA_TYPE */
+#if ELTEQUAL_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELTEQUAL_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELTEQUAL_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELTEQUAL_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELTEQUAL_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELTEQUAL_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+
+#elif ELTEQUAL_DATA_TYPE == FLOAT16BIT
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F16
+#define MORPH_IDT_SCALAR  xb_f16
+#endif
+
+#elif ELTEQUAL_DATA_TYPE == FLOAT32BIT
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F32
+#define MORPH_IDT_SCALAR  float
+#endif
+#endif
+
+#define EQUAL(a, b)  ((a) == (b))  /* parenthesized so unary/binary operators around the macro apply to the whole comparison */
+
+
+/**************************** xaiEltwiseEqual3D ******************************************/
+/* Description  : auto-vectorizable implementation of Broadcast elementWise EQUAL        */
+/*               operator, Based on MORPH implementation eight variants are              */
+/*               generated for S8, U8, S16, U16, S32, U32, F16 and F32 data types        */
+/*               A flat loop is used only for densely packed tiles without broadcast;    */
+/*               every other layout is processed row by row using the tile pitches       */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code                                                          */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseEqual3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Output extents and pitches (the pitches scale the typed element pointers below) */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+  const int32_t planeSizeOut  = dim1SizeOut * dim2SizeOut;
+
+  /* Base data pointers of the three tiles */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Declare the per-dimension input pitches; a pitch of 0 marks a broadcast dimension */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* Flat fast path. A single linear sweep matches the pitch-based addressing below    */
+  /* only when, for every output dimension larger than 1, no input is broadcast and    */
+  /* all three tiles are densely packed (row pitch == dim1, plane pitch == dim1*dim2). */
+  /* Equal dim-3 pitches alone do not guarantee that, e.g. for padded tiles.           */
+  if ((dim1SizeOut == 1 || (inTile1Pitch0 != 0 && inTile2Pitch0 != 0)) &&
+      (dim2SizeOut == 1 || (inTile1Pitch1 == dim1SizeOut && inTile2Pitch1 == dim1SizeOut && outTilePitch1 == dim1SizeOut)) &&
+      (dim3SizeOut == 1 || (inTile1Pitch2 == planeSizeOut && inTile2Pitch2 == planeSizeOut && outTilePitch2 == planeSizeOut)))
+  {
+    int dimsCount = planeSizeOut * dim3SizeOut;
+    pIn1 = pInput1;
+    pIn2 = pInput2;
+    pOut = pOutput;
+    for (int i = 0; i < dimsCount; i++)
+    {
+#if (ELTEQUAL_DATA_TYPE == FLOAT16BIT)
+      bool temp = EQUAL(pIn1[i], pIn2[i]);
+      pOut[i] = temp ? 1 : 0;
+#else
+      pOut[i] = EQUAL(pIn1[i], pIn2[i]);
+#endif
+    }
+  }
+  else
+  {
+    /* General path: step through planes (z) and rows (y) with each tile's own pitches. */
+    /* inTileNPitch0/1/2 == 0 means tile N is broadcast along dimension 1/2/3, so the   */
+    /* same element, row or plane is read again for every output position.              */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        if (inTile1Pitch0 == 0)
+        {
+          /* Tile1 is broadcast along dim 1: load its single element once, outside the loop */
+          InData1 = pIn1[0];
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2 = pIn2[idx];
+#if (ELTEQUAL_DATA_TYPE == FLOAT16BIT)
+            bool temp = EQUAL(InData1, InData2);
+            pOut[idx] = temp ? 1 : 0;
+#else
+            pOut[idx] = EQUAL(InData1, InData2);
+#endif
+          }
+        }
+        else if (inTile2Pitch0 == 0)
+        {
+          /* Tile2 is broadcast along dim 1: load its single element once, outside the loop */
+          InData2 = pIn2[0];
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1 = pIn1[idx];
+#if (ELTEQUAL_DATA_TYPE == FLOAT16BIT)
+            bool temp = EQUAL(InData1, InData2);
+            pOut[idx] = temp ? 1 : 0;
+#else
+            pOut[idx] = EQUAL(InData1, InData2);
+#endif
+          }
+        }
+        else
+        {
+          /* Neither input is broadcast along dim 1; dim 2/3 broadcast is already folded into pIn1/pIn2 */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1 = pIn1[idx];
+            InData2 = pIn2[idx];
+#if (ELTEQUAL_DATA_TYPE == FLOAT16BIT)
+            bool temp = EQUAL(InData1, InData2);
+            pOut[idx] = temp ? 1 : 0;
+#else
+            pOut[idx] = EQUAL(InData1, InData2);
+#endif
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.c
new file mode 100644
index 00000000000..a7eedbf95bc
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+/* Each define/include/undef triple includes cnn_eltwise_greaterthan.h once per element type; F16/F32 only when the matching VFPU option is 1 */
+#define ELT_GREATERTHAN_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_greaterthan.h"
+#undef ELT_GREATERTHAN_DATA_TYPE
+
+#define ELT_GREATERTHAN_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_greaterthan.h"
+#undef ELT_GREATERTHAN_DATA_TYPE
+
+#define ELT_GREATERTHAN_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_greaterthan.h"
+#undef ELT_GREATERTHAN_DATA_TYPE
+
+#define ELT_GREATERTHAN_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_greaterthan.h"
+#undef ELT_GREATERTHAN_DATA_TYPE
+
+#define ELT_GREATERTHAN_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_greaterthan.h"
+#undef ELT_GREATERTHAN_DATA_TYPE
+
+#define ELT_GREATERTHAN_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_greaterthan.h"
+#undef ELT_GREATERTHAN_DATA_TYPE
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#define ELT_GREATERTHAN_DATA_TYPE  FLOAT16BIT
+#include "cnn_eltwise_greaterthan.h"
+#undef ELT_GREATERTHAN_DATA_TYPE
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#define ELT_GREATERTHAN_DATA_TYPE  FLOAT32BIT
+#include "cnn_eltwise_greaterthan.h"
+#undef ELT_GREATERTHAN_DATA_TYPE
+#endif
+
+
+/**************************** xaiEltwiseGreaterThan3D_AV ***************************************/
+/* Description  : General API for the auto-vectorizable Broadcast element-wise GREATER         */
+/*                operator; calls the xaiEltwiseGreaterThan3D_<type>_AV variant for inTile1    */
+/* Inputs       : inTile1, inTile2                                                             */
+/* Outputs      : XI Error Code; XAI_ERR_NULLARG for a NULL tile, XAI_ERR_BADARG when          */
+/*                the type of inTile1 has no variant (outTile is then left unwritten)          */
+/* InOuts       : outTile                                                                      */
+/***********************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseGreaterThan3D_AV(const xai_pTile3D inTile1,
+                                        const xai_pTile3D inTile2,
+                                        xai_pTile3D outTile)
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseGreaterThan3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseGreaterThan3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseGreaterThan3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseGreaterThan3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseGreaterThan3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseGreaterThan3D_U32_AV(inTile1, inTile2, outTile));
+  }
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16))
+  {
+    return(xaiEltwiseGreaterThan3D_F16_AV(inTile1, inTile2, outTile));
+  }
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32))
+  {
+    return(xaiEltwiseGreaterThan3D_F32_AV(inTile1, inTile2, outTile));
+  }
+#endif
+
+  return(XAI_ERR_BADARG);  /* nothing was dispatched; NOTE(review): use a dedicated data-type error code if the library has one */
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.h
new file mode 100644
index 00000000000..08be132a40c
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.h
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT  /* data-type selector IDs; skipped when SIGNED8BIT is already defined */
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#define FLOAT16BIT     7
+#define FLOAT32BIT     8
+#endif
+/* Choose the name suffix (MAKE_NAME), tile type check (MORPH_IDT_CHECK) and element type (MORPH_IDT_SCALAR) for the variant selected by ELT_GREATERTHAN_DATA_TYPE, which is expected to be defined by the including file. */
+#if ELT_GREATERTHAN_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME  /* each branch first drops any earlier definitions so the header can be included repeatedly */
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELT_GREATERTHAN_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELT_GREATERTHAN_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELT_GREATERTHAN_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELT_GREATERTHAN_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELT_GREATERTHAN_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+
+#elif ELT_GREATERTHAN_DATA_TYPE == FLOAT16BIT
+#if XCHAL_HAVE_VISION_HP_VFPU == 1  /* the F16 mapping is set up only when this option is 1 */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F16
+#define MORPH_IDT_SCALAR  xb_f16
+#endif  /* when the option is not 1 the three macros keep whatever definition was in effect before */
+
+#elif ELT_GREATERTHAN_DATA_TYPE == FLOAT32BIT
+#if XCHAL_HAVE_VISION_SP_VFPU == 1  /* the F32 mapping is set up only when this option is 1 */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F32
+#define MORPH_IDT_SCALAR  float
+#endif  /* when the option is not 1 the three macros keep whatever definition was in effect before */
+#endif
+
+#define GREATER_THAN(a, b)  ((a) > (b))  /* operands and result parenthesised so the expansion is safe inside any larger expression */
+
+
+/**************************** xaiEltwiseGreaterThan3D ************************************/
+/* Description  : auto-vectorizable implementation of Broadcast elementWise GREATER      */
+/*               operator, Based on MORPH implementation eight variants are              */
+/*               generated for S8, U8, S16, U16, S32, U32, F16 and F32 data types        */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code                                                          */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+/* Each output element is 1 where inTile1 > inTile2 and 0 otherwise, stored in the same element type as the inputs. */
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseGreaterThan3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error checks: every tile must pass this variant's type check and the DRAM-boundary check; data orders must match; broadcast dimensions are validated last. */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Output extents and pitches drive every loop below; pitches are applied as element offsets to the typed pointers */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Base data pointers, viewed as arrays of this variant's scalar type */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  /* Working pointers re-aimed per row; __restrict is a no-alias promise to the compiler for the inner loops */
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Get Pitch appropriate for elementwise broadcast operations (the macro presumably declares the six inTileNPitchM variables used below - they are not declared anywhere else in this function) */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* Fast path when both inputs and the output share one dim-2 pitch: all tiles are walked as flat arrays of dim1*dim2*dim3 elements. NOTE(review): only the dim-2 pitch is compared - confirm this implies unpadded rows and no dim-1/dim-2 broadcast. */
+  if (inTile1Pitch2 == inTile2Pitch2 && inTile1Pitch2 == outTilePitch2)
+  {
+    int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut;
+    pIn1 = pInput1;
+    pIn2 = pInput2;
+    pOut = pOutput;
+
+    for (int i = 0; i < dimsCount; i++)
+    {
+#if (ELT_GREATERTHAN_DATA_TYPE == FLOAT16BIT)  /* F16: the comparison result is taken as a bool and written as 1 or 0 explicitly */
+      bool temp = GREATER_THAN(pIn1[i], pIn2[i]);
+      pOut[i] = temp ? 1 : 0;
+#else  /* all other types: the comparison result is assigned directly */
+      pOut[i] = GREATER_THAN(pIn1[i], pIn2[i]);
+#endif
+    }
+  }
+  else
+  {
+    /*
+       inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting
+       inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting
+       inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting
+       inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting
+       inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting
+       inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting
+     */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        /* Tile1 Dimension 1 broadcasting */
+        if (inTile1Pitch0 == 0)
+        {
+          InData1 = pIn1[0];
+          /* the broadcast operand was loaded once above, so the inner loop reads only Tile2 */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2 = pIn2[idx];
+#if (ELT_GREATERTHAN_DATA_TYPE == FLOAT16BIT)
+            bool temp = GREATER_THAN(InData1, InData2);
+            pOut[idx] = temp ? 1 : 0;
+#else
+            pOut[idx] = GREATER_THAN(InData1, InData2);
+#endif
+          }
+        }
+        /* Tile2 Dimension 1 broadcasting */
+        else if (inTile2Pitch0 == 0)
+        {
+          InData2 = pIn2[0];
+          /* the broadcast operand was loaded once above, so the inner loop reads only Tile1 */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1 = pIn1[idx];
+#if (ELT_GREATERTHAN_DATA_TYPE == FLOAT16BIT)
+            bool temp = GREATER_THAN(InData1, InData2);
+            pOut[idx] = temp ? 1 : 0;
+#else
+            pOut[idx] = GREATER_THAN(InData1, InData2);
+#endif
+          }
+        }
+        else
+        {
+          /* neither tile broadcasts along Dimension 1; any Dimension 2 or 3 broadcast is already covered by the zero pitches applied to the row pointers above */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1 = pIn1[idx];
+            InData2 = pIn2[idx];
+#if (ELT_GREATERTHAN_DATA_TYPE == FLOAT16BIT)
+            bool temp = GREATER_THAN(InData1, InData2);
+            pOut[idx] = temp ? 1 : 0;
+#else
+            pOut[idx] = GREATER_THAN(InData1, InData2);
+#endif
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.c
new file mode 100644
index 00000000000..da896ead3ca
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+
+#define ELT_LESSTHAN_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_lessthan.h"
+#undef ELT_LESSTHAN_DATA_TYPE
+/* Each define / include / undef triple re-includes the template header with a different ELT_LESSTHAN_DATA_TYPE, producing one typed xaiEltwiseLessThan3D_*_AV function per inclusion. */
+#define ELT_LESSTHAN_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_lessthan.h"
+#undef ELT_LESSTHAN_DATA_TYPE
+
+#define ELT_LESSTHAN_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_lessthan.h"
+#undef ELT_LESSTHAN_DATA_TYPE
+
+#define ELT_LESSTHAN_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_lessthan.h"
+#undef ELT_LESSTHAN_DATA_TYPE
+
+#define ELT_LESSTHAN_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_lessthan.h"
+#undef ELT_LESSTHAN_DATA_TYPE
+
+#define ELT_LESSTHAN_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_lessthan.h"
+#undef ELT_LESSTHAN_DATA_TYPE
+/* The floating-point variants are instantiated only when the matching XCHAL_HAVE_VISION_*_VFPU option is 1; the dispatcher below uses the same guards. */
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#define ELT_LESSTHAN_DATA_TYPE  FLOAT16BIT
+#include "cnn_eltwise_lessthan.h"
+#undef ELT_LESSTHAN_DATA_TYPE
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#define ELT_LESSTHAN_DATA_TYPE  FLOAT32BIT
+#include "cnn_eltwise_lessthan.h"
+#undef ELT_LESSTHAN_DATA_TYPE
+#endif
+
+XAI_ERR_TYPE xaiEltwiseLessThan3D_AV(const xai_pTile3D inTile1,  /* first operand; its element type selects the variant */
+                                     const xai_pTile3D inTile2,  /* second operand */
+                                     xai_pTile3D outTile)        /* destination tile */
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+  /* General entry point for the element-wise LESS operator: dispatch on the element type of inTile1 only; each typed variant runs its own checks on all three tiles. */
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseLessThan3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseLessThan3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseLessThan3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseLessThan3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseLessThan3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseLessThan3D_U32_AV(inTile1, inTile2, outTile));
+  }
+  /* F16 is dispatched only when XCHAL_HAVE_VISION_HP_VFPU == 1. */
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16))
+  {
+    return(xaiEltwiseLessThan3D_F16_AV(inTile1, inTile2, outTile));
+  }
+#endif
+  /* F32 is dispatched only when XCHAL_HAVE_VISION_SP_VFPU == 1. */
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32))
+  {
+    return(xaiEltwiseLessThan3D_F32_AV(inTile1, inTile2, outTile));
+  }
+#endif
+  /* No variant handles this data type: outTile was not written, so report an error instead of XAI_ERR_OK. */
+  return(XAI_ERR_BADARG);
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.h
new file mode 100644
index 00000000000..aab6c89d183
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.h
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT  /* data-type selector IDs; skipped when SIGNED8BIT is already defined */
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#define FLOAT16BIT     7
+#define FLOAT32BIT     8
+#endif
+/* Choose the name suffix (MAKE_NAME), tile type check (MORPH_IDT_CHECK) and element type (MORPH_IDT_SCALAR) for the variant selected by ELT_LESSTHAN_DATA_TYPE, which is expected to be defined by the including file. */
+#if ELT_LESSTHAN_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME  /* each branch first drops any earlier definitions so the header can be included repeatedly */
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELT_LESSTHAN_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELT_LESSTHAN_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELT_LESSTHAN_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELT_LESSTHAN_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELT_LESSTHAN_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+
+#elif ELT_LESSTHAN_DATA_TYPE == FLOAT16BIT
+#if XCHAL_HAVE_VISION_HP_VFPU == 1  /* the F16 mapping is set up only when this option is 1 */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F16
+#define MORPH_IDT_SCALAR  xb_f16
+#endif  /* when the option is not 1 the three macros keep whatever definition was in effect before */
+
+#elif ELT_LESSTHAN_DATA_TYPE == FLOAT32BIT
+#if XCHAL_HAVE_VISION_SP_VFPU == 1  /* the F32 mapping is set up only when this option is 1 */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F32
+#define MORPH_IDT_SCALAR  float
+#endif  /* when the option is not 1 the three macros keep whatever definition was in effect before */
+#endif
+
+#define LESS_THAN(a, b)  ((a) < (b))  /* operands and result parenthesised so the expansion is safe inside any larger expression */
+
+
+/**************************** xaiEltwiseLessThan3D ***************************************/
+/* Description  : auto-vectorizable implementation of Broadcast elementWise LESS         */
+/*               operator, Based on MORPH implementation eight variants are              */
+/*               generated for S8, U8, S16, U16, S32, U32, F16 and F32 data types        */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code                                                          */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+/* Each output element is 1 where inTile1 < inTile2 and 0 otherwise, stored in the same element type as the inputs. */
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseLessThan3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error checks: every tile must pass this variant's type check and the DRAM-boundary check; data orders must match; broadcast dimensions are validated last. */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Output extents and pitches drive every loop below; pitches are applied as element offsets to the typed pointers */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Base data pointers, viewed as arrays of this variant's scalar type */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  /* Working pointers re-aimed per row; __restrict is a no-alias promise to the compiler for the inner loops */
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Get Pitch appropriate for elementwise broadcast operations (the macro presumably declares the six inTileNPitchM variables used below - they are not declared anywhere else in this function) */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* Fast path when both inputs and the output share one dim-2 pitch: all tiles are walked as flat arrays of dim1*dim2*dim3 elements. NOTE(review): only the dim-2 pitch is compared - confirm this implies unpadded rows and no dim-1/dim-2 broadcast. */
+  if (inTile1Pitch2 == inTile2Pitch2 && inTile1Pitch2 == outTilePitch2)
+  {
+    int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut;
+    pIn1 = pInput1;
+    pIn2 = pInput2;
+    pOut = pOutput;
+
+    for (int i = 0; i < dimsCount; i++)
+    {
+#if (ELT_LESSTHAN_DATA_TYPE == FLOAT16BIT)  /* F16: the comparison result is taken as a bool and written as 1 or 0 explicitly */
+      bool temp = LESS_THAN(pIn1[i], pIn2[i]);
+      pOut[i] = temp ? 1 : 0;
+#else  /* all other types: the comparison result is assigned directly */
+      pOut[i] = LESS_THAN(pIn1[i], pIn2[i]);
+#endif
+    }
+  }
+  else
+  {
+    /*
+       inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting
+       inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting
+       inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting
+       inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting
+       inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting
+       inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting
+     */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        /* Tile1 Dimension 1 broadcasting */
+        if (inTile1Pitch0 == 0)
+        {
+          InData1 = pIn1[0];
+          /* the broadcast operand was loaded once above, so the inner loop reads only Tile2 */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2 = pIn2[idx];
+#if (ELT_LESSTHAN_DATA_TYPE == FLOAT16BIT)
+            bool temp = LESS_THAN(InData1, InData2);
+            pOut[idx] = temp ? 1 : 0;
+#else
+            pOut[idx] = LESS_THAN(InData1, InData2);
+#endif
+          }
+        }
+        /* Tile2 Dimension 1 broadcasting */
+        else if (inTile2Pitch0 == 0)
+        {
+          InData2 = pIn2[0];
+          /* the broadcast operand was loaded once above, so the inner loop reads only Tile1 */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1 = pIn1[idx];
+#if (ELT_LESSTHAN_DATA_TYPE == FLOAT16BIT)
+            bool temp = LESS_THAN(InData1, InData2);
+            pOut[idx] = temp ? 1 : 0;
+#else
+            pOut[idx] = LESS_THAN(InData1, InData2);
+#endif
+          }
+        }
+        else
+        {
+          /* neither tile broadcasts along Dimension 1; any Dimension 2 or 3 broadcast is already covered by the zero pitches applied to the row pointers above */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1 = pIn1[idx];
+            InData2 = pIn2[idx];
+#if (ELT_LESSTHAN_DATA_TYPE == FLOAT16BIT)
+            bool temp = LESS_THAN(InData1, InData2);
+            pOut[idx] = temp ? 1 : 0;
+#else
+            pOut[idx] = LESS_THAN(InData1, InData2);
+#endif
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.c
new file mode 100644
index 00000000000..aa62692b567
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+
+#define ELTMAX_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_max.h"
+#undef ELTMAX_DATA_TYPE
+/* Each define / include / undef triple re-includes the template header with a different ELTMAX_DATA_TYPE, producing one typed xaiEltwiseMax3D_*_AV function per inclusion. */
+#define ELTMAX_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_max.h"
+#undef ELTMAX_DATA_TYPE
+
+#define ELTMAX_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_max.h"
+#undef ELTMAX_DATA_TYPE
+
+#define ELTMAX_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_max.h"
+#undef ELTMAX_DATA_TYPE
+
+#define ELTMAX_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_max.h"
+#undef ELTMAX_DATA_TYPE
+
+#define ELTMAX_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_max.h"
+#undef ELTMAX_DATA_TYPE
+/* The floating-point variants are instantiated only when the matching XCHAL_HAVE_VISION_*_VFPU option is 1; the dispatcher below uses the same guards. */
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#define ELTMAX_DATA_TYPE  FLOAT16BIT
+#include "cnn_eltwise_max.h"
+#undef ELTMAX_DATA_TYPE
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#define ELTMAX_DATA_TYPE  FLOAT32BIT
+#include "cnn_eltwise_max.h"
+#undef ELTMAX_DATA_TYPE
+#endif
+
+
+/**************************** xaiEltwiseMax3D_AV ***************************************/
+/* Description  : General API for auto-vectorizable Broadcast element-wise             */
+/*                MAX operator                                                         */
+/*                Calls one of the xaiEltwiseMax3D_AV functions based on the data type */
+/* Inputs       : inTile1, inTile2                                                     */
+/* Outputs      : XI Error Code                                                        */
+/* InOuts       : outTile                                                              */
+/***************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseMax3D_AV(const xai_pTile3D inTile1,  /* first operand; its element type selects the variant */
+                                const xai_pTile3D inTile2,  /* second operand */
+                                xai_pTile3D outTile)        /* destination tile */
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+  /* Dispatch on the element type of inTile1 only; each typed variant runs its own checks on all three tiles. */
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseMax3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseMax3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseMax3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseMax3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseMax3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseMax3D_U32_AV(inTile1, inTile2, outTile));
+  }
+  /* F16 is dispatched only when XCHAL_HAVE_VISION_HP_VFPU == 1. */
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16))
+  {
+    return(xaiEltwiseMax3D_F16_AV(inTile1, inTile2, outTile));
+  }
+#endif
+  /* F32 is dispatched only when XCHAL_HAVE_VISION_SP_VFPU == 1. */
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32))
+  {
+    return(xaiEltwiseMax3D_F32_AV(inTile1, inTile2, outTile));
+  }
+#endif
+  /* No variant handles this data type: outTile was not written, so report an error instead of XAI_ERR_OK. */
+  return(XAI_ERR_BADARG);
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.h
new file mode 100644
index 00000000000..f87622c40e3
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT  /* data-type selector IDs; skipped when SIGNED8BIT is already defined */
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#define FLOAT16BIT     7
+#define FLOAT32BIT     8
+#endif
+/* Choose the name suffix (MAKE_NAME), tile type check (MORPH_IDT_CHECK) and element type (MORPH_IDT_SCALAR) for the variant selected by ELTMAX_DATA_TYPE, which is expected to be defined by the including file. */
+#if ELTMAX_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME  /* each branch first drops any earlier definitions so the header can be included repeatedly */
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELTMAX_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELTMAX_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELTMAX_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELTMAX_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELTMAX_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+
+#elif ELTMAX_DATA_TYPE == FLOAT16BIT
+#if XCHAL_HAVE_VISION_HP_VFPU == 1  /* the F16 mapping is set up only when this option is 1 */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F16
+#define MORPH_IDT_SCALAR  xb_f16
+#endif  /* when the option is not 1 the three macros keep whatever definition was in effect before */
+
+#elif ELTMAX_DATA_TYPE == FLOAT32BIT
+#if XCHAL_HAVE_VISION_SP_VFPU == 1  /* the F32 mapping is set up only when this option is 1 */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F32
+#define MORPH_IDT_SCALAR  float
+#endif  /* when the option is not 1 the three macros keep whatever definition was in effect before */
+#endif
+
+
+/**************************** xaiEltwiseMax3D ********************************************/
+/* Description  : auto-vectorizable implementation of Broadcast elementWise MAX          */
+/*               operator, Based on MORPH implementation eight variants are              */
+/*               generated for S8, U8, S16, U16, S32, U32, F16 and F32 data types        */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code                                                          */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+/* Each output element is MAX2(inTile1 element, inTile2 element); MAX2 is defined outside this header (presumably the larger of its two arguments). */
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseMax3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error checks: every tile must pass this variant's type check and the DRAM-boundary check; data orders must match; broadcast dimensions are validated last. */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Output extents and pitches drive every loop below; pitches are applied as element offsets to the typed pointers */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Base data pointers, viewed as arrays of this variant's scalar type */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  /* Working pointers re-aimed per row; __restrict is a no-alias promise to the compiler for the inner loops */
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Get Pitch appropriate for elementwise broadcast operations (the macro presumably declares the six inTileNPitchM variables used below - they are not declared anywhere else in this function) */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* Fast path when both inputs and the output share one dim-2 pitch: all tiles are walked as flat arrays of dim1*dim2*dim3 elements. NOTE(review): only the dim-2 pitch is compared - confirm this implies unpadded rows and no dim-1/dim-2 broadcast. */
+  if (inTile1Pitch2 == inTile2Pitch2 && inTile1Pitch2 == outTilePitch2)
+  {
+    int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut;
+    pIn1 = pInput1;
+    pIn2 = pInput2;
+    pOut = pOutput;
+
+    for (int i = 0; i < dimsCount; i++)
+    {
+      pOut[i] = MAX2(pIn1[i], pIn2[i]);
+    }
+  }
+  else
+  {
+    /*
+       inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting
+       inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting
+       inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting
+       inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting
+       inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting
+       inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting
+     */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        /* Tile1 Dimension 1 broadcasting */
+        if (inTile1Pitch0 == 0)
+        {
+          InData1 = pIn1[0];
+          /* the broadcast operand was loaded once above, so the inner loop reads only Tile2 */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2   = pIn2[idx];
+            pOut[idx] = MAX2(InData1, InData2);
+          }
+        }
+        /* Tile2 Dimension 1 broadcasting */
+        else if (inTile2Pitch0 == 0)
+        {
+          InData2 = pIn2[0];
+          /* the broadcast operand was loaded once above, so the inner loop reads only Tile1 */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            pOut[idx] = MAX2(InData1, InData2);
+          }
+        }
+        else
+        {
+          /* neither tile broadcasts along Dimension 1; any Dimension 2 or 3 broadcast is already covered by the zero pitches applied to the row pointers above */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            InData2   = pIn2[idx];
+            pOut[idx] = MAX2(InData1, InData2);
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.c
new file mode 100644
index 00000000000..60b4d1e0523
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+/* Instantiate the cnn_eltwise_min.h template once per element type: each include   */
+/* below emits one xaiEltwiseMin3D_<type>_AV variant selected by ELTMIN_DATA_TYPE.  */
+#define ELTMIN_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+
+#define ELTMIN_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+
+#define ELTMIN_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+
+#define ELTMIN_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+
+#define ELTMIN_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+
+#define ELTMIN_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#define ELTMIN_DATA_TYPE  FLOAT16BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#define ELTMIN_DATA_TYPE  FLOAT32BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+#endif
+
+
+/**************************** xaiEltwiseMin3D_AV *****************************************/
+/* Description  : Generic entry point of the auto-vectorizable broadcast element-wise    */
+/*                MIN operator; dispatches on the data type of inTile1 to the matching   */
+/*                xaiEltwiseMin3D_<type>_AV variant                                      */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code (XAI_ERR_BADARG when no variant handles the data type)   */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseMin3D_AV(const xai_pTile3D inTile1,
+                                const xai_pTile3D inTile2,
+                                xai_pTile3D outTile)
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseMin3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseMin3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseMin3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseMin3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseMin3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseMin3D_U32_AV(inTile1, inTile2, outTile));
+  }
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16))
+  {
+    return(xaiEltwiseMin3D_F16_AV(inTile1, inTile2, outTile));
+  }
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32))
+  {
+    return(xaiEltwiseMin3D_F32_AV(inTile1, inTile2, outTile));
+  }
+#endif
+  /* No variant matched: report it instead of returning OK with outTile left untouched. */
+  return(XAI_ERR_BADARG);
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.h
new file mode 100644
index 00000000000..520272cce9d
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#define FLOAT16BIT     7
+#define FLOAT32BIT     8
+#endif
+
+#if ELTMIN_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELTMIN_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELTMIN_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELTMIN_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELTMIN_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELTMIN_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+
+#elif ELTMIN_DATA_TYPE == FLOAT16BIT
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F16
+#define MORPH_IDT_SCALAR  xb_f16
+#endif
+
+#elif ELTMIN_DATA_TYPE == FLOAT32BIT
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F32
+#define MORPH_IDT_SCALAR  float
+#endif
+#endif
+
+
+/**************************** xaiEltwiseMin3D ********************************************/
+/* Description  : auto-vectorizable implementation of Broadcast elementWise MIN          */
+/*               operator. Based on the MORPH macros, eight variants are generated       */
+/*               for the S8, U8, S16, U16, S32, U32, F16 and F32 data types              */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code                                                          */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseMin3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Get Pitch appropriate for elementwise broadcast operations */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* The flat single-loop path needs more than equal dim-2 pitches: no dimension may be
+   * broadcast and all three tiles must be densely packed (no edge between rows/planes).
+   * A dimension whose output size is 1 is never stepped over, so it is not constrained. */
+  const int32_t planeSize = dim1SizeOut * dim2SizeOut;
+  const int32_t noBcast1  = (dim1SizeOut == 1) || ((inTile1Pitch0 != 0) && (inTile2Pitch0 != 0));
+  const int32_t dense2    = (dim2SizeOut == 1) || ((inTile1Pitch1 == dim1SizeOut) && (inTile2Pitch1 == dim1SizeOut) && (outTilePitch1 == dim1SizeOut));
+  const int32_t dense3    = (dim3SizeOut == 1) || ((inTile1Pitch2 == planeSize) && (inTile2Pitch2 == planeSize) && (outTilePitch2 == planeSize));
+
+  if (noBcast1 && dense2 && dense3)
+  {
+    int dimsCount = planeSize * dim3SizeOut;
+    pIn1 = pInput1;
+    pIn2 = pInput2;
+    pOut = pOutput;
+    for (int i = 0; i < dimsCount; i++)
+    {
+      pOut[i] = MIN2(pIn1[i], pIn2[i]);
+    }
+  }
+  else
+  {
+    /* A zero inTileXPitchN (N = 0, 1, 2) marks tile X as broadcast along dimension N + 1:
+     * its pointer is then not advanced when stepping along that dimension. */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        /* Tile1 Dimension 1 broadcasting */
+        if (inTile1Pitch0 == 0)
+        {
+          InData1 = pIn1[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2   = pIn2[idx];
+            pOut[idx] = MIN2(InData1, InData2);
+          }
+        }
+        /* Tile2 Dimension 1 broadcasting */
+        else if (inTile2Pitch0 == 0)
+        {
+          InData2 = pIn2[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            pOut[idx] = MIN2(InData1, InData2);
+          }
+        }
+        else
+        {
+          /* no dimension 1 broadcast: dims 2/3 may still be broadcast (or padded) in Tile1 or Tile2 */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            InData2   = pIn2[idx];
+            pOut[idx] = MIN2(InData1, InData2);
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_mul_S32.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_mul_S32.c
new file mode 100644
index 00000000000..432f4b80f96
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_mul_S32.c
@@ -0,0 +1,570 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+#if XCHAL_HAVE_VISION              //build only on VISION dsps
+/******************************** eltwiseMulS32_BroadCastDims1_AV *****************************/
+/* Description : Optimized implementation of Broadcast Elementwise Multiplication             */
+/*               functionality across first dimension.                                        */
+/* Inputs      : inTile1, inTile2, broadcast pitch values of both input tiles                 */
+/* Outputs     : None (void); nothing is written unless inTile1Pitch0 or inTile2Pitch0 is 0   */
+/* InOuts      : outTile; both input tiles and outTile are signed 32bit                       */
+/* Assumptions : While performing element wise multiplication of two input tiles, edge        */
+/*               data is ignored                                                              */
+/**********************************************************************************************/
+static _XAI_INLINE_ void eltwiseMulS32_BroadCastDims1_AV(const xai_pTile3D inTile1,
+                                                               const xai_pTile3D inTile2,
+                                                               xai_pTile3D outTile,
+                                                               int32_t inTile1Pitch0,
+                                                               int32_t inTile2Pitch0,
+                                                               int32_t inTile1Pitch1,
+                                                               int32_t inTile2Pitch1,
+                                                               int32_t inTile1Pitch2,
+                                                               int32_t inTile2Pitch2)
+{
+  /* Get Tile Parameters */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  int32_t *pInput1 = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  int32_t *pInput2 = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  int32_t *pOutput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  int32_t *restrict outPtr1;
+  int32_t *restrict inp1Ptr;
+  int32_t *restrict inp2Ptr;
+
+  int32_t *restrict outPtr_z;
+  int32_t *restrict inp1Ptr_z;
+  int32_t *restrict inp2Ptr_z;
+
+  // Outer Most Loop Pitch Variables
+  int32_t oOutPitch = outTilePitch2;
+  int32_t oIn1Pitch = inTile1Pitch2;
+  int32_t oIn2Pitch = inTile2Pitch2;
+
+  // Middle Loop Pitch Variables
+  int32_t mOutPitch = outTilePitch1;
+  int32_t mIn1Pitch = inTile1Pitch1;
+  int32_t mIn2Pitch = inTile2Pitch1;
+
+  int32_t innerMostLoopCnt = dim1SizeOut;
+  int32_t middleLoopCnt    = dim2SizeOut;
+  int32_t outerMostLoopCnt = dim3SizeOut;
+  /* Merged loops reuse one scalar per run: the tile constant along dims 2/3 must be the dim-1 broadcast tile */
+  if (((inTile2Pitch0 == 0) && (inTile2Pitch1 == 0) && (inTile2Pitch2 == 0) &&
+       (dim2SizeOut * inTile1Pitch1 == inTile1Pitch2) && (dim1SizeOut == inTile1Pitch1) &&
+       (dim2SizeOut * outTilePitch1 == outTilePitch2) && (dim1SizeOut == outTilePitch1)) ||
+      ((inTile1Pitch0 == 0) && (inTile1Pitch1 == 0) && (inTile1Pitch2 == 0) &&
+       (dim2SizeOut * inTile2Pitch1 == inTile2Pitch2) && (dim1SizeOut == inTile2Pitch1) &&
+       (dim2SizeOut * outTilePitch1 == outTilePitch2) && (dim1SizeOut == outTilePitch1)))
+  {
+    innerMostLoopCnt = dim1SizeOut * dim2SizeOut * dim3SizeOut;
+    middleLoopCnt    = 1;
+    outerMostLoopCnt = 1;
+
+    /* Middle Loop Pitch Variables */
+    mIn1Pitch = 0;
+    mIn2Pitch = 0;
+    mOutPitch = 0;
+
+    /* Outer Most Loop Pitch Variables */
+    oOutPitch = 0;
+    oIn1Pitch = 0;
+    oIn2Pitch = 0;
+  }
+  else if ((inTile2Pitch0 == 0 && inTile2Pitch1 == 0 && dim1SizeOut == inTile1Pitch1 && dim1SizeOut == outTilePitch1) ||
+           (inTile1Pitch0 == 0 && inTile1Pitch1 == 0 && dim1SizeOut == inTile2Pitch1 && dim1SizeOut == outTilePitch1))
+  {
+    innerMostLoopCnt = dim1SizeOut * dim2SizeOut;
+    middleLoopCnt    = dim3SizeOut;
+    outerMostLoopCnt = 1;
+
+    /* Middle Loop Pitch Variables */
+    mOutPitch = outTilePitch2;
+    mIn1Pitch = inTile1Pitch2;
+    mIn2Pitch = inTile2Pitch2;
+
+    /* Outer Most Loop Pitch Variables */
+    oOutPitch = 0;
+    oIn1Pitch = 0;
+    oIn2Pitch = 0;
+  }
+
+#if defined(IVP_MULN_2X32) || (XCHAL_HAVE_HIFI1 || XCHAL_HAVE_HIFI3Z || XCHAL_HAVE_HIFI4 || XCHAL_HAVE_HIFI5 || (XCHAL_HAVE_BBENEP == 1)) /*Auto vectorization is done only if S32 mul ISA is available*/
+/*Adding KQ8 conditionalization also, as KQ8 doesn't have S32 mul support direct or indirect. Therefore, for KQ8 Auto vec attempt shall fail and plain scalar C code shall be used.*/
+  /* Tile1 Dimension 1 broadcasting */
+  if (inTile1Pitch0 == 0)
+  {
+    // This loop process dim1, dim2, dim3 in the same order from innermost
+    for (z = 0; z < outerMostLoopCnt; z++)
+    {
+      outPtr_z  = (int32_t *) (pOutput + z * oOutPitch);
+      inp1Ptr_z = (int32_t *) (pInput1 + z * oIn1Pitch);
+      inp2Ptr_z = (int32_t *) (pInput2 + z * oIn2Pitch);
+
+      for (y = 0; y < middleLoopCnt; y++)
+      {
+        outPtr1 = (int32_t *) (outPtr_z + y * mOutPitch);
+        inp1Ptr = (int32_t *) (inp1Ptr_z + y * mIn1Pitch);
+        inp2Ptr = (int32_t *) (inp2Ptr_z + y * mIn2Pitch);
+
+        /* Load Input 1 */
+        int32_t InData1 = inp1Ptr[0];
+
+        for (x = 0; x < innerMostLoopCnt; x++)
+        {
+          int32_t InData2   = inp2Ptr[x];
+          outPtr1[x] = InData1 * InData2;
+        }
+      }
+    }
+  }
+  /* Tile2 Dimension 1 broadcasting */
+  else if (inTile2Pitch0 == 0)
+  {
+    // This loop process dim1, dim2, dim3 in the same order from innermost
+    for (z = 0; z < outerMostLoopCnt; z++)
+    {
+      outPtr_z  = (int32_t *) (pOutput + z * oOutPitch);
+      inp1Ptr_z = (int32_t *) (pInput1 + z * oIn1Pitch);
+      inp2Ptr_z = (int32_t *) (pInput2 + z * oIn2Pitch);
+
+      for (y = 0; y < middleLoopCnt; y++)
+      {
+        outPtr1 = (int32_t *) (outPtr_z + y * mOutPitch);
+        inp1Ptr = (int32_t *) (inp1Ptr_z + y * mIn1Pitch);
+        inp2Ptr = (int32_t *) (inp2Ptr_z + y * mIn2Pitch);
+
+        /* Load Input 2 */
+        int32_t InData2 = inp2Ptr[0];
+
+        for (x = 0; x < innerMostLoopCnt; x++)
+        {
+          int32_t InData1   = inp1Ptr[x];
+          outPtr1[x] = InData1 * InData2;
+        }
+      }
+    }
+  }
+#else
+  xb_vecN_2x32v  * restrict pvecIn1;
+  xb_vecN_2x32v  * restrict pvecIn2;
+  xb_vecN_2x32v * restrict pvecOut;
+
+  xb_vecN_2x32v vecInData1;  /* 1st input tile */
+  xb_vecN_2x32v vecInData2;  /* 2nd input tile*/
+
+  const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH >> 1;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Tile1 Dimension 1 broadcasting */
+  if (inTile1Pitch0 == 0)
+  {
+    // This loop process dim1, dim2, dim3 in the same order from innermost
+    for (z = 0; z < outerMostLoopCnt; z++)
+    {
+      outPtr_z  = (int32_t *) (pOutput + z * oOutPitch);
+      inp1Ptr_z = (int32_t *) (pInput1 + z * oIn1Pitch);
+      inp2Ptr_z = (int32_t *) (pInput2 + z * oIn2Pitch);
+
+      for (y = 0; y < middleLoopCnt; y++)
+      {
+        outPtr1 = (int32_t *) (outPtr_z + y * mOutPitch);
+        inp1Ptr = (int32_t *) (inp1Ptr_z + y * mIn1Pitch);
+        inp2Ptr = (int32_t *) (inp2Ptr_z + y * mIn2Pitch);
+
+        /* Vector and pointer of Input 2 and output to load and store values */
+        pvecIn2 = (xb_vecN_2x32v *) (inp2Ptr);
+        valign vaInData2 = IVP_LAN_2X32_PP(pvecIn2);
+
+        pvecOut = (xb_vecN_2x32v *) (outPtr1);
+
+        /* Load Input 1 */
+        vecInData1 = (xb_vecN_2x32v) (inp1Ptr[0]);
+        for (x = 0; x < innerMostLoopCnt; x += vectorizationWidth)
+        {
+          /* load input data from 2nd tile, input data pointer is post incremented by varlen by the load instruction */
+          IVP_LAVN_2X32_XP(vecInData2, vaInData2, pvecIn2, (innerMostLoopCnt - x) * 4);
+
+          /* populate wide vectors with product of inputs */
+          xb_vecN_2x64w wvecAcc;
+          wvecAcc = IVP_MULUSN_2X16X32_0(IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecInData2)), vecInData1);
+          IVP_MULAHN_2X16X32_1(wvecAcc, IVP_MOVNX16_FROMN_2X32(vecInData2), vecInData1);
+
+          /* truncate the multiply result in wide vector into 32 bit format*/
+          xb_vecN_2x32v vecOutData = IVP_PACKLN_2X64W(wvecAcc);
+
+          IVP_SAVN_2X32_XP(vecOutData, vaOutData, pvecOut, (innerMostLoopCnt - x) * 4);
+        }
+        IVP_SAPOSN_2X32_FP(vaOutData, pvecOut);
+      }
+    }
+  }
+  else if (inTile2Pitch0 == 0)
+  {
+    // This loop process dim1, dim2, dim3 in the same order from innermost
+    for (z = 0; z < outerMostLoopCnt; z++)
+    {
+      outPtr_z  = (int32_t *) (pOutput + z * oOutPitch);
+      inp1Ptr_z = (int32_t *) (pInput1 + z * oIn1Pitch);
+      inp2Ptr_z = (int32_t *) (pInput2 + z * oIn2Pitch);
+
+      for (y = 0; y < middleLoopCnt; y++)
+      {
+        outPtr1 = (int32_t *) (outPtr_z + y * mOutPitch);
+        inp1Ptr = (int32_t *) (inp1Ptr_z + y * mIn1Pitch);
+        inp2Ptr = (int32_t *) (inp2Ptr_z + y * mIn2Pitch);
+
+        /* Vector and pointer of Input 1 and output to load and store values */
+        pvecIn1 = (xb_vecN_2x32v *) (inp1Ptr);
+        valign vaInData1 = IVP_LAN_2X32_PP(pvecIn1);
+
+        pvecOut = (xb_vecN_2x32v *) (outPtr1);
+
+        /* Load Input 2 */
+        vecInData2 = (xb_vecN_2x32v) (inp2Ptr[0]);
+        for (x = 0; x < innerMostLoopCnt; x += vectorizationWidth)
+        {
+          /* load input data from 1st tile, input data pointer is post incremented by varlen by the load instruction */
+          IVP_LAVN_2X32_XP(vecInData1, vaInData1, pvecIn1, (innerMostLoopCnt - x) * 4);
+
+          /* populate wide vectors with product of inputs */
+          xb_vecN_2x64w wvecAcc;
+          wvecAcc = IVP_MULUSN_2X16X32_0(IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecInData2)), vecInData1);
+          IVP_MULAHN_2X16X32_1(wvecAcc, IVP_MOVNX16_FROMN_2X32(vecInData2), vecInData1);
+
+          /* truncate the multiply result in wide vector into 32 bit format*/
+          xb_vecN_2x32v vecOutData = IVP_PACKLN_2X64W(wvecAcc);
+
+          IVP_SAVN_2X32_XP(vecOutData, vaOutData, pvecOut, (innerMostLoopCnt - x) * 4);
+        }
+        IVP_SAPOSN_2X32_FP(vaOutData, pvecOut);
+      }
+    }
+  }
+#endif
+}
+
+/**************************** xaiEltwiseMul3D ********************************************/
+/* Description  : auto-vectorizable implementation of element-wise S32 multiplication    */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code                                                          */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseMul3D_S32_AV(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S32(inTile1);
+    XAI_CHECK_TILE3D_S32(inTile2);
+    XAI_CHECK_TILE3D_S32(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Get Tile Parameters */
+  const int32_t inTile1dim1Size = XAI_TILE3D_GET_DIM1(inTile1);
+  const int32_t inTile2dim1Size = XAI_TILE3D_GET_DIM1(inTile2);
+  const int32_t inTile1dim2Size = XAI_TILE3D_GET_DIM2(inTile1);
+  const int32_t inTile2dim2Size = XAI_TILE3D_GET_DIM2(inTile2);
+  const int32_t inTile1dim3Size = XAI_TILE3D_GET_DIM3(inTile1);
+  const int32_t inTile2dim3Size = XAI_TILE3D_GET_DIM3(inTile2);
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  int32_t *pInput1 = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  int32_t *pInput2 = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  int32_t *pOutput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* broadcast flag is set in case of dimension sizes mismatch of inTile1 and inTile2 */
+  /* If broadcast flag is set, only the generalized variant is used, even if edges are absent */
+  int32_t bcastFlag = 0;
+  if (!((inTile1dim1Size == inTile2dim1Size) && (inTile1dim2Size == inTile2dim2Size) && (inTile1dim3Size == inTile2dim3Size)))
+  {
+    bcastFlag = 1;
+  }
+
+  int32_t is_2D = ((outTilePitch1 == dim1SizeOut) && (XAI_TILE3D_GET_DIM1_PITCH(inTile1) == dim1SizeOut) && (XAI_TILE3D_GET_DIM1_PITCH(inTile2) == dim1SizeOut)) ? 1 : 0;
+  int32_t is_1D = ((outTilePitch2 == (dim1SizeOut * dim2SizeOut)) && (XAI_TILE3D_GET_DIM2_PITCH(inTile1) == (dim1SizeOut * dim2SizeOut)) && (XAI_TILE3D_GET_DIM2_PITCH(inTile2) == (dim1SizeOut * dim2SizeOut))) ? 1 : 0;
+
+  /* Get Pitch appropriate for elementwise broadcast operations */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  if ((inTile1dim1Size == 1 || inTile2dim1Size == 1) && (!(inTile1dim1Size == inTile2dim1Size)))
+  {
+    eltwiseMulS32_BroadCastDims1_AV(inTile1, inTile2, outTile, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                          inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+  }
+  else
+  {
+#if defined(IVP_MULN_2X32) || (XCHAL_HAVE_HIFI1 || XCHAL_HAVE_HIFI3Z || XCHAL_HAVE_HIFI4 || XCHAL_HAVE_HIFI5 || (XCHAL_HAVE_BBENEP == 1)) /*Auto vectorization is done only if S32 mul ISA is available*/ 
+/*Adding KQ8 conditionalization also, as KQ8 doesn't have S32 mul support direct or indirect. Therefore, for KQ8 Auto vec attempt shall fail and plain scalar C code shall be used.*/
+  int32_t *__restrict pIn1;
+  int32_t *__restrict pIn2;
+  int32_t *__restrict pOut;
+
+  /* Overall design approach is split in 2 sections depending on the optimal
+   * tile sizes. When the edge length along dimension1 is zero, loops across
+   * dimension1 and dimension2 can be merged.
+   */
+
+  /* check for optimal tile size i.e edge length along dimension1 is zero */
+  if (is_2D && (!bcastFlag))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3SizeOut;
+    int32_t maxLoopCount     = dim1SizeOut * dim2SizeOut;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if (is_1D)
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+      dim3MaxLoopCount = 1;       /* Update max loop counter */
+      maxLoopCount    *= dim3SizeOut;
+    }
+    for (int j = 0; j < dim3MaxLoopCount; j++)
+    {
+      pIn1 = pInput1 + j * inTile1Pitch2;
+      pIn2 = pInput2 + j * inTile2Pitch2;
+      pOut = pOutput + j * outTilePitch2;
+      for(int i = 0; i < maxLoopCount; i++)
+      {
+        pOut[i] = (int32_t)(pIn1[i] * pIn2[i]);
+      }
+    }
+  }
+  else
+  {
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      int32_t* temp1 = pInput1 + z * inTile1Pitch2;
+      int32_t* temp2 = pInput2 + z * inTile2Pitch2;
+      int32_t* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        int32_t InData1, InData2;
+        for (idx = 0; idx < dim1SizeOut; idx++)
+        {
+          InData1   = pIn1[idx];
+          InData2   = pIn2[idx];
+          pOut[idx] = (int32_t) (InData1 * InData2);
+        }
+      }
+    }
+  }
+#else
+  /* Following code is written for P6/P1 as they don't support S32 MUL. As mentioned, it is used when 32b MUL ISA is not available. */
+  /* However, P1 has a proto defined for S32 MUL, internally using 32x16 MUL only.             */
+  /* Therefore for P1, the above scalar code shall be used which shall be not auto vectorized, */
+  /* as compiler cannot find a direct 32b MUL ISA in P1.                                       */
+  /* Therefore, P1 shall give a low performance for this API.                                  */
+
+  /* input and output pointers */
+  xb_vecN_2x32v * restrict pvecIn1;
+  xb_vecN_2x32v * restrict pvecIn2;
+  xb_vecN_2x32v * restrict pdvecOut;
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH >> 1;
+
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Overall design approach is split in 2 sections depending on the optimal
+   * tile sizes. When the edge length along dimension1 is zero, loops across
+   * dimension1 and dimension2 can be merged.
+   */
+
+  /* check for optimal tile size i.e edge length along dimension1 is zero */
+  if (is_2D && (!bcastFlag))
+  {
+    /******************************************************************************/
+    /* Data exist in contiguous memory location with respect to first dimension   */
+    /******************************************************************************/
+
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3SizeOut;
+    int32_t maxLoopCount     = dim1SizeOut * dim2SizeOut;
+
+    /* Updated Loop count based on tile dimension configuration */
+    if (is_1D)
+    {
+      /**********************************************************************/
+      /* Data exist in contiguous memory location with respect to first and */
+      /* second dimension                                                   */
+      /**********************************************************************/
+      dim3MaxLoopCount = 1;       /* Update max loop counter */
+      maxLoopCount    *= dim3SizeOut;
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      pvecIn1 = (xb_vecN_2x32v *) &pInput1[z * inTile1Pitch2];
+      valign vaInData1 = IVP_LAN_2X32_PP (pvecIn1);
+
+      pvecIn2 = (xb_vecN_2x32v *) &pInput2[z * inTile2Pitch2];
+      valign vaInData2 = IVP_LAN_2X32_PP (pvecIn2);
+
+      pdvecOut = (xb_vecN_2x32v *) &pOutput[z * outTilePitch2];
+
+      /* loop across dimension1, dimension2 and dimension3 is combined */
+      for (x = 0; x <= maxLoopCount - vectorizationWidth; x += vectorizationWidth)
+      {
+        /* input data vectors */
+        xb_vecN_2x32v vecInData1;  /* first input tile */
+        xb_vecN_2x32v vecInData2;  /* 2nd input tile*/
+
+        /* load input data from 1st tile, input data pointer is post incremented
+         * implicitly by SIMD/2 by the load instruction */
+        IVP_LAN_2X32_IP(vecInData1, vaInData1, pvecIn1);
+
+        /* load input data from 2nd tile, input data pointer is post incremented
+         * implicitly by SIMD/2 by the load instruction */
+        IVP_LAN_2X32_IP(vecInData2, vaInData2, pvecIn2);
+
+        /* populate wide vectors with product of inputs */
+        xb_vecN_2x64w acc1;
+        acc1 = IVP_MULUSN_2X16X32_0(IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecInData2)), vecInData1);
+        IVP_MULAHN_2X16X32_1(acc1, IVP_MOVNX16_FROMN_2X32(vecInData2), vecInData1);
+        
+        /* truncate the multiply result in wide vector into 32 bit format*/
+        xb_vecN_2x32v vecOutData = IVP_PACKLN_2X64W(acc1);
+
+        IVP_SAVN_2X32_XP(vecOutData, vaOutData, pdvecOut, vectorizationWidth * 4);
+      } /* end of for (x = 0; x <= maxLoopCount - vectorizationWidth; x += vectorizationWidth) */
+
+      if (x < maxLoopCount)
+      {
+        /* input data vectors */
+        xb_vecN_2x32v vecInData1;  /* 1st input tile */
+        xb_vecN_2x32v vecInData2;  /* 2nd input tile*/
+
+        /* variable store count for output */
+        int32_t varLen = (maxLoopCount - x) * 4;
+
+        /* load input data from 1st tile, input data pointer is post incremented by varLen, by the load instruction */
+        IVP_LAVN_2X32_XP(vecInData1, vaInData1, pvecIn1, varLen);
+
+        /* load input data from 2nd tile, input data pointer is post incremented by varLen, by the load instruction */
+        IVP_LAVN_2X32_XP(vecInData2, vaInData2, pvecIn2, varLen);
+
+        /* populate wide vectors with product of inputs */
+        xb_vecN_2x64w acc1;
+        acc1 = IVP_MULUSN_2X16X32_0(IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecInData2)), vecInData1);
+        IVP_MULAHN_2X16X32_1(acc1, IVP_MOVNX16_FROMN_2X32(vecInData2), vecInData1);
+
+        /* truncate the multiply result in wide vector into 32 bit format*/
+        xb_vecN_2x32v vecOutData = IVP_PACKLN_2X64W(acc1);
+        IVP_SAVN_2X32_XP(vecOutData, vaOutData, pdvecOut, varLen);
+      } /*end of if (x < maxLoopCount)*/
+      IVP_SAPOSN_2X32_FP(vaOutData, pdvecOut);
+    } /* end of for (z = 0; z < dim3MaxLoopCount; z++) */
+  }   /* end of if ((inTile1Pitch1 == dim1SizeOut) && (inTile2Pitch1 == dim1SizeOut) && (outTilePitch1 == dim1SizeOut)) */
+  /* Handle cases with edges and/or broadcast along dim2/3 */
+  else
+  {
+    for (x = 0; x < dim1SizeOut; x += vectorizationWidth) /* along 1st dimension */
+    {
+      /* variable store count for output */
+      int32_t varLen = (dim1SizeOut - x) * 4;
+
+      for (z = 0; z < dim3SizeOut; z++)   /* along 3rd dimension */
+      {
+        int32_t * pIn1 = &pInput1[z * inTile1Pitch2 + x];
+
+        int32_t * pIn2 = &pInput2[z * inTile2Pitch2 + x];
+        /* pointer for 1st tile */
+        pvecIn1 = (xb_vecN_2x32v *) pIn1;
+
+        /* pointer for 2nd tile */
+        pvecIn2 = (xb_vecN_2x32v *) pIn2;
+
+        int32_t * pOut = &pOutput[z * outTilePitch2 + x];
+
+        for (y = 0; y < dim2SizeOut; y++) /* along 2nd dimension */
+        {
+          /* input data vectors */
+          /* 1st input tile */
+          xb_vecN_2x32v vecInData1;   
+
+          /* 2nd input tile */
+          xb_vecN_2x32v vecInData2;
+
+          /* load input data from 1st tile */
+          valign vaInData1 = IVP_LAN_2X32_PP(pvecIn1);
+
+          IVP_LAN_2X32_XP(vecInData1, vaInData1, pvecIn1, inTile1Pitch1 * 4);
+
+          /* load input data from 2nd tile */
+          valign vaInData2 = IVP_LAN_2X32_PP (pvecIn2);
+
+          IVP_LAN_2X32_XP(vecInData2, vaInData2, pvecIn2, inTile2Pitch1 * 4);
+
+          /* populate wide vectors with product of inputs */
+          xb_vecN_2x64w acc1;
+          acc1 = IVP_MULUSN_2X16X32_0(IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecInData2)), vecInData1);
+          IVP_MULAHN_2X16X32_1(acc1, IVP_MOVNX16_FROMN_2X32(vecInData2), vecInData1);
+
+          pdvecOut = (xb_vecN_2x32v *) pOut;
+          /* truncate the multiply result in wide vector into 32 bit format*/
+          xb_vecN_2x32v vecOutData = IVP_PACKLN_2X64W(acc1);
+          IVP_SAVN_2X32_XP(vecOutData, vaOutData, pdvecOut, varLen);
+
+          IVP_SAPOSN_2X32_FP(vaOutData, pdvecOut);
+          pOut += outTilePitch1;
+        } /* end of for (y = 0; y < dim2SizeOut; y++) loop */
+      }   /* end of for (z = 0; z < dim3SizeOut; z++) loop */
+    }   /* end of for (x = 0; x < dim1SizeOut; x += vectorizationWidth) loop */
+  } /* end of else */
+#endif
+  }
+  return(XAI_ERROR_STATUS());
+}
+#endif //#if XCHAL_HAVE_VISION
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.c
new file mode 100644
index 00000000000..511ee3b4c77
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+
+#define ELTOR_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_or.h"
+#undef ELTOR_DATA_TYPE
+
+#define ELTOR_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_or.h"
+#undef ELTOR_DATA_TYPE
+
+#define ELTOR_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_or.h"
+#undef ELTOR_DATA_TYPE
+
+#define ELTOR_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_or.h"
+#undef ELTOR_DATA_TYPE
+
+#define ELTOR_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_or.h"
+#undef ELTOR_DATA_TYPE
+
+#define ELTOR_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_or.h"
+#undef ELTOR_DATA_TYPE
+
+
+/**************************** xaiEltwiseOr3D_AV ******************************************/
+/* Description  : Dispatches the auto-vectorizable broadcast element-wise bitwise OR to  */
+/*                the S8/U8/S16/U16/S32/U32 variant selected by the data type of inTile1 */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XAI_ERR_NULLARG (NULL tile), XAI_ERR_BADARG (unsupported data type),   */
+/*                otherwise the status returned by the selected typed variant            */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseOr3D_AV(const xai_pTile3D inTile1,
+                               const xai_pTile3D inTile2,
+                               xai_pTile3D outTile)
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseOr3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseOr3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseOr3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseOr3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseOr3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseOr3D_U32_AV(inTile1, inTile2, outTile));
+  }
+  /* No variant matched: report it instead of returning success with outTile left unwritten */
+  return(XAI_ERR_BADARG);
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.h
new file mode 100644
index 00000000000..a88b0bc7fb8
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#endif
+/* Select the function-name suffix, tile type check and scalar type for ELTOR_DATA_TYPE */
+#if ELTOR_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELTOR_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELTOR_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELTOR_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELTOR_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELTOR_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+#endif
+/* Operands and result are parenthesized so OR2 stays a single expression wherever it expands */
+#define OR2(a, b)  ((a) | (b))
+
+
+/**************************** xaiEltwiseOr3D *********************************************/
+/* Description  : auto-vectorizable implementation of Broadcast element-wise bitwise OR  */
+/*               operator. Based on MORPH implementation six variants are generated      */
+/*               for S8, U8, S16, U16, S32, and U32 data types                           */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code                                                          */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseOr3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Get Pitch appropriate for elementwise broadcast operations */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* Flat loop is valid only for dense (edge-free), non-broadcast tiles; all else goes per row */
+  if (inTile1Pitch0 != 0 && inTile2Pitch0 != 0 &&
+      inTile1Pitch1 == dim1SizeOut && inTile2Pitch1 == dim1SizeOut && outTilePitch1 == dim1SizeOut &&
+      inTile1Pitch2 == dim1SizeOut * dim2SizeOut && inTile1Pitch2 == inTile2Pitch2 && inTile1Pitch2 == outTilePitch2)
+  {
+    int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut;
+    pIn1 = pInput1;
+    pIn2 = pInput2;
+    pOut = pOutput;
+
+    for (int i = 0; i < dimsCount; i++)
+    {
+      pOut[i] = OR2(pIn1[i], pIn2[i]);
+    }
+  }
+  else
+  {
+    /*
+       General path: every row is addressed through its own pitches, so tiles with
+       edges and any broadcast combination are handled here.
+       inTileXPitch0/1/2 == 0 : Tile X is broadcast along dimension 1/2/3 respectively
+       (pitch values as produced by XAI_TILE3D_GET_BCAST123_PITCH above)
+     */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        /* Tile1 Dimension 1 broadcasting */
+        if (inTile1Pitch0 == 0)
+        {
+          InData1 = pIn1[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2   = pIn2[idx];
+            pOut[idx] = OR2(InData1, InData2);
+          }
+        }
+        /* Tile2 Dimension 1 broadcasting */
+        else if (inTile2Pitch0 == 0)
+        {
+          InData2 = pIn2[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            pOut[idx] = OR2(InData1, InData2);
+          }
+        }
+        else
+        {
+          /* no dimension 1 broadcast: both rows are read element by element */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            InData2   = pIn2[idx];
+            pOut[idx] = OR2(InData1, InData2);
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.c
new file mode 100644
index 00000000000..e2049e66f4f
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+
+#define ELTSUB_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+
+#define ELTSUB_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+
+#define ELTSUB_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+
+#define ELTSUB_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+
+#define ELTSUB_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+
+#define ELTSUB_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#define ELTSUB_DATA_TYPE  FLOAT16BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#define ELTSUB_DATA_TYPE  FLOAT32BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+#endif
+
+
+/**************************** xaiEltwiseSub3D_AV *****************************************/
+/* Description  : Dispatches the auto-vectorizable broadcast element-wise subtraction to */
+/*                the typed variant matching inTile1 (F16/F32 only when compiled in)     */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XAI_ERR_NULLARG, XAI_ERR_BADARG (unsupported type) or variant status   */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseSub3D_AV(const xai_pTile3D inTile1,
+                                const xai_pTile3D inTile2,
+                                xai_pTile3D outTile)
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseSub3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseSub3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseSub3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseSub3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseSub3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseSub3D_U32_AV(inTile1, inTile2, outTile));
+  }
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16))
+  {
+    return(xaiEltwiseSub3D_F16_AV(inTile1, inTile2, outTile));
+  }
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32))
+  {
+    return(xaiEltwiseSub3D_F32_AV(inTile1, inTile2, outTile));
+  }
+#endif
+  /* No variant matched (or it was compiled out): do not report success with outTile unwritten */
+  return(XAI_ERR_BADARG);
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.h
new file mode 100644
index 00000000000..26aca7d8b13
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#define FLOAT16BIT     7
+#define FLOAT32BIT     8
+#endif
+/* Select the function-name suffix, tile type check and scalar type for ELTSUB_DATA_TYPE */
+#if ELTSUB_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELTSUB_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELTSUB_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELTSUB_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELTSUB_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELTSUB_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+
+#elif ELTSUB_DATA_TYPE == FLOAT16BIT
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F16
+#define MORPH_IDT_SCALAR  xb_f16
+#endif
+
+#elif ELTSUB_DATA_TYPE == FLOAT32BIT
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _F32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F32
+#define MORPH_IDT_SCALAR  float
+#endif
+#endif
+/* Operands and result are parenthesized so SUB2 stays a single expression wherever it expands */
+#define SUB2(a, b)  ((a) - (b))
+
+
+/**************************** xaiEltwiseSub3D ********************************************/
+/* Description  : auto-vectorizable implementation of Broadcast element-wise subtraction */
+/*               Based on MORPH implementation eight variants are generated for          */
+/*               S8, U8, S16, U16, S32, U32, F16 and F32 data types                      */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code                                                          */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseSub3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Get Pitch appropriate for elementwise broadcast operations */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* Flat loop is valid only for dense (edge-free), non-broadcast tiles; all else goes per row */
+  if (inTile1Pitch0 != 0 && inTile2Pitch0 != 0 &&
+      inTile1Pitch1 == dim1SizeOut && inTile2Pitch1 == dim1SizeOut && outTilePitch1 == dim1SizeOut &&
+      inTile1Pitch2 == dim1SizeOut * dim2SizeOut && inTile1Pitch2 == inTile2Pitch2 && inTile1Pitch2 == outTilePitch2)
+  {
+    int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut;
+    pIn1 = pInput1;
+    pIn2 = pInput2;
+    pOut = pOutput;
+
+    for (int i = 0; i < dimsCount; i++)
+    {
+      pOut[i] = SUB2(pIn1[i], pIn2[i]);
+    }
+  }
+  else
+  {
+    /*
+       General path: every row is addressed through its own pitches, so tiles with
+       edges and any broadcast combination are handled here.
+       inTileXPitch0/1/2 == 0 : Tile X is broadcast along dimension 1/2/3 respectively
+       (pitch values as produced by XAI_TILE3D_GET_BCAST123_PITCH above)
+     */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        /* Tile1 Dimension 1 broadcasting */
+        if (inTile1Pitch0 == 0)
+        {
+          InData1 = pIn1[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2   = pIn2[idx];
+            pOut[idx] = SUB2(InData1, InData2);
+          }
+        }
+        /* Tile2 Dimension 1 broadcasting */
+        else if (inTile2Pitch0 == 0)
+        {
+          InData2 = pIn2[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            pOut[idx] = SUB2(InData1, InData2);
+          }
+        }
+        else
+        {
+          /* no dimension 1 broadcast: both rows are read element by element */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            InData2   = pIn2[idx];
+            pOut[idx] = SUB2(InData1, InData2);
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.c
new file mode 100644
index 00000000000..f1e97a03418
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+
+#define ELTXOR_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_xor.h"
+#undef ELTXOR_DATA_TYPE
+
+#define ELTXOR_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_xor.h"
+#undef ELTXOR_DATA_TYPE
+
+#define ELTXOR_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_xor.h"
+#undef ELTXOR_DATA_TYPE
+
+#define ELTXOR_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_xor.h"
+#undef ELTXOR_DATA_TYPE
+
+#define ELTXOR_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_xor.h"
+#undef ELTXOR_DATA_TYPE
+
+#define ELTXOR_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_xor.h"
+#undef ELTXOR_DATA_TYPE
+
+
+/**************************** xaiEltwiseXor3D_AV *****************************************/
+/* Description  : Dispatches the auto-vectorizable broadcast element-wise bitwise XOR to */
+/*                the S8/U8/S16/U16/S32/U32 variant selected by the data type of inTile1 */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XAI_ERR_NULLARG (NULL tile), XAI_ERR_BADARG (unsupported data type),   */
+/*                otherwise the status returned by the selected typed variant            */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseXor3D_AV(const xai_pTile3D inTile1,
+                                const xai_pTile3D inTile2,
+                                xai_pTile3D outTile)
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseXor3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseXor3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseXor3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseXor3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseXor3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseXor3D_U32_AV(inTile1, inTile2, outTile));
+  }
+  /* No variant matched: report it instead of returning success with outTile left unwritten */
+  return(XAI_ERR_BADARG);
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.h
new file mode 100644
index 00000000000..28cd63389f0
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+
+#ifndef SIGNED8BIT  /* type tags compared against ELTXOR_DATA_TYPE below; defined only once */
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#endif
+
+#if ELTXOR_DATA_TYPE == SIGNED8BIT  /* each branch rebinds the name suffix, type check and scalar type */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELTXOR_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELTXOR_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELTXOR_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELTXOR_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELTXOR_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)  name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+#else  /* undefined or unknown tag: fail here rather than reuse stale MAKE_NAME/MORPH_IDT_* */
+#error "ELTXOR_DATA_TYPE must be one of SIGNED8BIT..UNSIGNED32BIT before including cnn_eltwise_xor.h"
+#endif
+#define XOR2(a, b)  ((a) ^ (b))  /* fully parenthesized so the expansion is safe inside larger expressions */
+
+
+/**************************** xaiEltwiseXor3D *********************************************/
+/* Description  : auto-vectorizable implementation of Broadcast elementWise and bitwise  */
+/*               XOR operator. Based on MORPH implementation, six variants are           */
+/*               generated for S8, U8, S16, U16, S32, and U32 data types                 */
+/* Inputs       : inTile1, inTile2                                                       */
+/* Outputs      : XI Error Code                                                          */
+/* InOuts       : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseXor3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks: element type, DRAM boundary, matching data order, broadcast dimensions */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Get Tile Parameters (all loop bounds below come from the output tile) */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_SCALAR *__restrict pIn1;  /* __restrict: the three row pointers are declared non-aliasing */
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;  /* NOTE(review): in-place use (outTile sharing an input buffer) would break that promise -- confirm callers */
+
+  /* Get Pitch appropriate for elementwise broadcast operations; presumably this macro declares the six inTile*Pitch* names used below */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* no Broadcast: single flat loop. NOTE(review): only the dim-3 pitches are compared, yet the loop indexes all tiles as contiguous -- confirm */
+  if (inTile1Pitch2 == inTile2Pitch2 && inTile1Pitch2 == outTilePitch2)
+  {
+    int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut;
+    pIn1 = pInput1;
+    pIn2 = pInput2;
+    pOut = pOutput;
+
+    for (int i = 0; i < dimsCount; i++)
+    {
+      pOut[i] = XOR2(pIn1[i], pIn2[i]);
+    }
+  }
+  else
+  {
+    /*
+       inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting
+       inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting
+       inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting
+       inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting
+       inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting
+       inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting
+     */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;  /* a zero pitch re-reads the same plane */
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);  /* a zero pitch re-reads the same row */
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        /* Tile1 Dimension 1 broadcasting */
+        if (inTile1Pitch0 == 0)
+        {
+          InData1 = pIn1[0];
+          /* the broadcast value is loaded once above, so the core loop does one load per element */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2   = pIn2[idx];
+            pOut[idx] = XOR2(InData1, InData2);
+          }
+        }
+        /* Tile2 Dimension 1 broadcasting */
+        else if (inTile2Pitch0 == 0)
+        {
+          InData2 = pIn2[0];
+          /* the broadcast value is loaded once above, so the core loop does one load per element */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            pOut[idx] = XOR2(InData1, InData2);
+          }
+        }
+        else
+        {
+          /* no Dimension 1 broadcasting; any broadcast in Tile1 or Tile2 is along dims 2/3 via the zero pitches above */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            InData2   = pIn2[idx];
+            pOut[idx] = XOR2(InData1, InData2);
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/xai_buildinfo.c b/backends/cadence/vision/third-party/libxai_common/src/xai_buildinfo.c
new file mode 100644
index 00000000000..87665867c54
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/xai_buildinfo.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_core.h"
+
+// XI library build configuration - Release/Debug or other (from _XAI_BUILD_CONFIGURATION_ via XAI_AUX_STR)
+char XAI_BUILD_CONFIGURATION[] = XAI_AUX_STR(_XAI_BUILD_CONFIGURATION_);
+
+// Xtensa tools version (from _XAI_BUILD_TOOLS_VERSION_ via XAI_AUX_STR)
+char XAI_BUILD_TOOLS_VERSION[] = XAI_AUX_STR(_XAI_BUILD_TOOLS_VERSION_);
+
+// target core name and hardware name; "CSTUB (x86)" when XCHAL_CORE_ID is not defined
+char XAI_BUILD_CORE_ID[] =
+#if defined(XCHAL_CORE_ID) && defined(XCHAL_HW_VERSION_NAME)
+  XCHAL_CORE_ID " (" XCHAL_HW_VERSION_NAME ")"
+#elif defined(XCHAL_CORE_ID)
+  XCHAL_CORE_ID
+#else
+  "CSTUB (x86)"
+#endif
+;
+
+// error level: symbolic name followed by XAI_AUX_STR of the matching constant; unrecognized levels use XAI_AUX_STR(XAI_ERROR_LEVEL) alone
+char XAI_BUILD_ERROR_LEVEL[] =
+#if XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_PRINT_AND_CONTINUE_ON_ERROR
+  "PRINT_AND_CONTINUE_ON_ERROR (" XAI_AUX_STR(XAI_ERROR_LEVEL_PRINT_AND_CONTINUE_ON_ERROR) ")"
+#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_PRINT_ON_ERROR
+  "PRINT_ON_ERROR (" XAI_AUX_STR(XAI_ERROR_LEVEL_PRINT_ON_ERROR) ")"
+#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_CONTINUE_ON_ERROR
+  "CONTINUE_ON_ERROR (" XAI_AUX_STR(XAI_ERROR_LEVEL_CONTINUE_ON_ERROR) ")"
+#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_RETURN_ON_ERROR
+  "RETURN_ON_ERROR (" XAI_AUX_STR(XAI_ERROR_LEVEL_RETURN_ON_ERROR) ")"
+#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_TERMINATE_ON_ERROR
+  "TERMINATE_ON_ERROR (" XAI_AUX_STR(XAI_ERROR_LEVEL_TERMINATE_ON_ERROR) ")"
+#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_NO_ERROR
+  "NO_ERROR (" XAI_AUX_STR(XAI_ERROR_LEVEL_NO_ERROR) ")"
+#else
+  XAI_AUX_STR(XAI_ERROR_LEVEL)
+#endif
+;
+
+// library features: space-terminated tags appended to an initially empty string
+char XAI_BUILD_FEATURES_STR[] = ""
+#if __XTENSA__ && XAI_EMULATE_LOCAL_RAM && XAI_ERROR_LEVEL != XAI_ERROR_LEVEL_NO_ERROR
+                                "DRAM_CHECK "
+#endif
+;
diff --git a/backends/cadence/vision/third-party/libxai_common/src/xai_errstr.c b/backends/cadence/vision/third-party/libxai_common/src/xai_errstr.c
new file mode 100644
index 00000000000..be08daac86f
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/xai_errstr.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 by Cadence Design Systems, Inc.  ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc.  They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc.  This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_core.h"
+
+/* Returns a static, human-readable description of an XAI error code.
+ * Codes without a case below fall through to "Unknown error". */
+const char* xaiErrStr(XAI_ERR_TYPE code)
+{
+  switch (code)
+  {
+    case XAI_ERR_OK:             return("No error");
+    case XAI_ERR_IALIGNMENT:     return("Input alignment requirements are not satisfied");
+    case XAI_ERR_OALIGNMENT:     return("Output alignment requirements are not satisfied");
+    case XAI_ERR_MALIGNMENT:     return("Same modulo alignment requirement is not satisfied");
+    case XAI_ERR_BADARG:         return("Function arguments are somehow invalid");
+    case XAI_ERR_MEMLOCAL:       return("Tile is not placed in local memory");
+    case XAI_ERR_INPLACE:        return("Inplace operation is not supported");
+    case XAI_ERR_EDGE:           return("Edge extension size is too small");
+    case XAI_ERR_DATASIZE:       return("Input/output tile size is too small or too big or otherwise inconsistent");
+    case XAI_ERR_TMPSIZE:        return("Temporary tile size is too small or otherwise inconsistent");
+    case XAI_ERR_KSIZE:          return("Filter kernel size is not supported");
+    case XAI_ERR_NORM:           return("Invalid normalization divisor or shift value");
+    case XAI_ERR_COORD:          return("Tile coordinates are invalid");
+    case XAI_ERR_BADTRANSFORM:   return("Transform is singular or otherwise invalid");
+    case XAI_ERR_NULLARG:        return("One of required arguments is NULL");
+    case XAI_ERR_THRESH_INVALID: return("Threshold value is somehow invalid");
+    case XAI_ERR_SCALE:          return("Provided scale factor is not supported");
+    case XAI_ERR_OVERFLOW:       return("Tile size can lead to sum overflow");
+    case XAI_ERR_NOTIMPLEMENTED: return("The requested functionality is absent in current version of XI Library");
+    case XAI_ERR_CHANNEL_INVALID: return("Channel number is somehow invalid");
+    case XAI_ERR_DATATYPE:       return("Argument has invalid data type");
+    case XAI_ERR_NO_VARIANT:     return("No suitable variant of the function is available");
+    case XAI_ERR_CUSTOMACC_PREPARE: return("Preparing custom acc hardware fails");
+    case XAI_ERR_CUSTOMACC_EXECUTE: return("Executing ops on custom acc hardware fails");
+    case XAI_ERR_CUSTOMACC_REMOVE:  return("Removing a network for custom acc hardware fails");
+
+    case XAI_ERR_POOR_DECOMPOSITION: return("Computed transform decomposition can produce visual artifacts");
+    case XAI_ERR_OUTOFTILE:      return("The arguments or results are out of tile");
+    case XAI_ERR_OBJECTLOST:     return("Tracked object is lost");
+    case XAI_ERR_RANSAC_NOTFOUND: return("Unable to find an appropriate model for RANSAC");
+    case XAI_ERR_REPLAY:         return("Repeated function call is required for completion");
+  }
+  return("Unknown error");
+}

From c2a48d2f5d04292d0a3f40c6c6023cdb2e750ad1 Mon Sep 17 00:00:00 2001
From: Suraj Raut <sraut@cadence.com>
Date: Fri, 29 May 2026 00:49:33 -0700
Subject: [PATCH 2/7] Reset non-Cadence files to upstream/main

---
 .ci/docker/common/install_linter.sh           |   4 -
 .github/workflows/lint.yml                    |  46 ---
 .github/workflows/mlx.yml                     |   4 +
 1                                             |   1 +
 backends/arm/tosa/partitioner.py              |   5 +-
 backends/xnnpack/third-party/XNNPACK          |   2 +-
 backends/xnnpack/third-party/cpuinfo          |   2 +-
 backends/xnnpack/third-party/pthreadpool      |   2 +-
 extension/llm/tokenizers                      |   2 +-
 kernels/portable/cpu/op__device_copy.cpp      | 154 +++++++++
 kernels/portable/functions.yaml               |  10 +
 kernels/test/op__device_copy_test.cpp         | 297 ++++++++++++++++++
 kernels/test/targets.bzl                      |  14 +-
 resnet18_log1.log                             |   0
 shim                                          |   2 +-
 shim_et/xplat/executorch/codegen/codegen.bzl  |   1 +
 .../kernels/portable/op_registration_util.bzl |   6 +
 third-party/ao                                |   2 +-
 third-party/pocketfft                         |   2 +-
 19 files changed, 495 insertions(+), 61 deletions(-)
 create mode 100644 1
 create mode 100644 kernels/portable/cpu/op__device_copy.cpp
 create mode 100644 kernels/test/op__device_copy_test.cpp
 create mode 100644 resnet18_log1.log

diff --git a/.ci/docker/common/install_linter.sh b/.ci/docker/common/install_linter.sh
index 52d2d262685..4a796a72d54 100755
--- a/.ci/docker/common/install_linter.sh
+++ b/.ci/docker/common/install_linter.sh
@@ -13,7 +13,3 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 # NB: Install all linter dependencies, the caching of lintrunner init could be
 # done after Executorch becomes public
 pip_install -r requirements-lintrunner.txt
-
-# Install google-java-format
-curl -L --retry 3 --retry-all-errors https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64 > /opt/google-java-format
-chmod +x /opt/google-java-format
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index b26247d2333..b21cc527b8d 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -125,49 +125,3 @@ jobs:
     uses: ./.github/workflows/_link_check.yml
     with:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-
-  android-java-format:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-
-      - uses: actions/setup-java@v4
-        with:
-          distribution: 'temurin'
-          java-version: '17'
-
-      - name: Check Java formatting
-        run: |
-          GOOGLE_JAVA_FORMAT_VERSION="1.24.0"
-          curl -sSfL "https://github.com/google/google-java-format/releases/download/v${GOOGLE_JAVA_FORMAT_VERSION}/google-java-format-${GOOGLE_JAVA_FORMAT_VERSION}-all-deps.jar" \
-            -o /tmp/google-java-format.jar
-
-          FILES_NEEDS_FORMAT=$(find extension/android/executorch_android/src/main/java/org/pytorch/executorch \
-                              extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm \
-                              extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations \
-                              extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch \
-                              extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench \
-                              extension/benchmark/android/benchmark/app/src/androidTest/java/org/pytorch/minibench \
-                              -type f -name "*.java" 2>/dev/null | \
-                              xargs -r java -jar /tmp/google-java-format.jar -n)
-
-          if [ -n "$FILES_NEEDS_FORMAT" ]; then
-            echo "Warning: The following files need formatting:"
-            echo "$FILES_NEEDS_FORMAT"
-            echo ""
-            echo "Please use google-java-format from https://github.com/google/google-java-format/releases/"
-            echo ""
-            echo "To fix, run one of these commands:"
-            echo "  # Using xargs (recommended):"
-            echo "  find <paths> -type f -name '*.java' | xargs google-java-format -i"
-            echo ""
-            echo "  # Or format specific files:"
-            echo "$FILES_NEEDS_FORMAT" | while IFS= read -r file; do
-              echo "  google-java-format -i \"$file\""
-            done
-            exit 1
-          fi
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
index c4be146f862..027101ba7f0 100644
--- a/.github/workflows/mlx.yml
+++ b/.github/workflows/mlx.yml
@@ -47,6 +47,10 @@ jobs:
 
         ${CONDA_RUN} pip list
 
+        echo "::group::Install Python test requirements"
+        ${CONDA_RUN} pip install gguf
+        echo "::endgroup::"
+
         echo "::group::Build test runners"
         ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner -j$(( $(sysctl -n hw.ncpu) - 1 ))
         echo "::endgroup::"
diff --git a/1 b/1
new file mode 100644
index 00000000000..8462b88277b
--- /dev/null
+++ b/1
@@ -0,0 +1 @@
+usage: list-sessions [-F format]
diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py
index d93e212c314..37b9cd7cc2a 100644
--- a/backends/arm/tosa/partitioner.py
+++ b/backends/arm/tosa/partitioner.py
@@ -550,7 +550,10 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         partition_tags = {tag: self.delegation_spec for tag in tags}
 
         tag_constant_data(exported_program)
-        if self.intermediate_path is not None and logger.level <= logging.INFO:
+        if (
+            self.intermediate_path is not None
+            and logger.getEffectiveLevel() <= logging.INFO
+        ):
             intermediate_path = Path(self.intermediate_path)
             intermediate_path.mkdir(parents=True, exist_ok=True)
             file_handler = logging.FileHandler(
diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK
index 1adaa7c709d..3131afead79 160000
--- a/backends/xnnpack/third-party/XNNPACK
+++ b/backends/xnnpack/third-party/XNNPACK
@@ -1 +1 @@
-Subproject commit 1adaa7c709d4839d29e1f219cb962b01c9e6a905
+Subproject commit 3131afead790c5c69a9aa12273dfc40399789ad7
diff --git a/backends/xnnpack/third-party/cpuinfo b/backends/xnnpack/third-party/cpuinfo
index f9a03241f8c..8a9210069b5 160000
--- a/backends/xnnpack/third-party/cpuinfo
+++ b/backends/xnnpack/third-party/cpuinfo
@@ -1 +1 @@
-Subproject commit f9a03241f8c3d4ed0c9728f5d70bff873d43d4e0
+Subproject commit 8a9210069b5a37dd89ed118a783945502a30a4ae
diff --git a/backends/xnnpack/third-party/pthreadpool b/backends/xnnpack/third-party/pthreadpool
index a56dcd79c69..c2ba5c50bb5 160000
--- a/backends/xnnpack/third-party/pthreadpool
+++ b/backends/xnnpack/third-party/pthreadpool
@@ -1 +1 @@
-Subproject commit a56dcd79c699366e7ac6466792c3025883ff7704
+Subproject commit c2ba5c50bb58d1397b693740cf75fad836a0d1bf
diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers
index b642403834a..3aada3fe28c 160000
--- a/extension/llm/tokenizers
+++ b/extension/llm/tokenizers
@@ -1 +1 @@
-Subproject commit b642403834a67c8ef14a7109dcd1bb5e5f3cb68a
+Subproject commit 3aada3fe28c945d14d5ec62254eb56ccdf10eb11
diff --git a/kernels/portable/cpu/op__device_copy.cpp b/kernels/portable/cpu/op__device_copy.cpp
new file mode 100644
index 00000000000..5e1a51a83be
--- /dev/null
+++ b/kernels/portable/cpu/op__device_copy.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * Runtime kernels for et_copy._h2d_copy and et_copy._d2h_copy ops.
+ *
+ * These ops transfer tensor data between CPU and device memory using
+ * the DeviceAllocator interface. The device type is inferred from the
+ * tensor metadata (out.device_type() for H2D, self.device_type() for D2H),
+ * which was set during AOT serialization by PropagateDevicePass.
+ */
+
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace torch {
+namespace executor {
+namespace native {
+
+using Tensor = executorch::aten::Tensor;
+using DeviceAllocator = executorch::runtime::DeviceAllocator;
+using Error = executorch::runtime::Error;
+
+/**
+ * Copies self's bytes from host (CPU) memory into out's device memory using
+ * the DeviceAllocator registered for out's device type.
+ *
+ * self: source tensor on CPU
+ * out:  destination tensor on device (memory-planned by runtime)
+ * The device type and index are inferred from out's TensorImpl metadata.
+ */
+Tensor&
+_h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) {
+  auto device_type = out.unsafeGetTensorImpl()->device_type();
+  auto device_index = out.unsafeGetTensorImpl()->device_index();
+
+  ET_KERNEL_CHECK_MSG( // source must be a CPU tensor
+      ctx,
+      self.unsafeGetTensorImpl()->device_type() ==
+          executorch::runtime::etensor::DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_h2d_copy: source tensor must be on CPU, got device_type=%d",
+      static_cast<int>(self.unsafeGetTensorImpl()->device_type()));
+
+  ET_KERNEL_CHECK_MSG( // destination must be a non-CPU tensor
+      ctx,
+      device_type != executorch::runtime::etensor::DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_h2d_copy: destination tensor must be on a non-CPU device");
+
+  auto nbytes = self.nbytes(); // the copy below transfers exactly this many bytes
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      nbytes == out.nbytes(),
+      InvalidArgument,
+      out,
+      "_h2d_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu",
+      nbytes,
+      out.nbytes());
+
+  DeviceAllocator* allocator = // looked up by the destination's device type
+      executorch::runtime::get_device_allocator(device_type);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      allocator != nullptr,
+      NotFound,
+      out,
+      "_h2d_copy: no device allocator registered for device_type=%d",
+      static_cast<int>(device_type));
+
+  Error err = allocator->copy_host_to_device( // (dst, src, nbytes, device index)
+      out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      err == Error::Ok,
+      Internal,
+      out,
+      "_h2d_copy: copy_host_to_device failed");
+
+  return out;
+}
+
+/**
+ * Copies self's bytes from device memory into out's host (CPU) memory using
+ * the DeviceAllocator registered for self's device type.
+ *
+ * self: source tensor on device
+ * out:  destination tensor on CPU (memory-planned by runtime)
+ * The device type and index are inferred from self's TensorImpl metadata.
+ */
+Tensor&
+_d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) {
+  auto device_type = self.unsafeGetTensorImpl()->device_type();
+  auto device_index = self.unsafeGetTensorImpl()->device_index();
+
+  ET_KERNEL_CHECK_MSG( // source must be a non-CPU tensor
+      ctx,
+      device_type != executorch::runtime::etensor::DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_d2h_copy: source tensor must be on a non-CPU device");
+
+  ET_KERNEL_CHECK_MSG( // destination must be a CPU tensor
+      ctx,
+      out.unsafeGetTensorImpl()->device_type() ==
+          executorch::runtime::etensor::DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_d2h_copy: destination tensor must be on CPU, got device_type=%d",
+      static_cast<int>(out.unsafeGetTensorImpl()->device_type()));
+
+  auto nbytes = self.nbytes(); // the copy below transfers exactly this many bytes
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      nbytes == out.nbytes(),
+      InvalidArgument,
+      out,
+      "_d2h_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu",
+      nbytes,
+      out.nbytes());
+
+  DeviceAllocator* allocator = // looked up by the source's device type
+      executorch::runtime::get_device_allocator(device_type);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      allocator != nullptr,
+      NotFound,
+      out,
+      "_d2h_copy: no device allocator registered for device_type=%d",
+      static_cast<int>(device_type));
+
+  Error err = allocator->copy_device_to_host( // (dst, src, nbytes, device index)
+      out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      err == Error::Ok,
+      Internal,
+      out,
+      "_d2h_copy: copy_device_to_host failed");
+
+  return out;
+}
+
+} // namespace native
+} // namespace executor
+} // namespace torch
diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml
index 620d97d050f..ecf62ee3606 100644
--- a/kernels/portable/functions.yaml
+++ b/kernels/portable/functions.yaml
@@ -1045,6 +1045,16 @@
     - arg_meta: null
       kernel_name: torch::executor::zeros_out
 
+- func: et_copy::_h2d_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::_h2d_copy_out
+
+- func: et_copy::_d2h_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::_d2h_copy_out
+
 - func: dim_order_ops::_empty_dim_order.out(int[] size, *, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
diff --git a/kernels/test/op__device_copy_test.cpp b/kernels/test/op__device_copy_test.cpp
new file mode 100644
index 00000000000..d345642bd37
--- /dev/null
+++ b/kernels/test/op__device_copy_test.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * Tests for et_copy._h2d_copy.out and et_copy._d2h_copy.out runtime kernels.
+ *
+ * Uses a MockDeviceAllocator to verify that the kernels correctly call
+ * copy_host_to_device / copy_device_to_host via the DeviceAllocator interface,
+ * and that device type is inferred from tensor metadata.
+ */
+
+#include <gtest/gtest.h>
+
+#include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/portable_type/tensor_impl.h>
+#include <executorch/runtime/platform/runtime.h>
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::aten::TensorImpl;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::Error;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+using TensorShapeDynamism = executorch::runtime::TensorShapeDynamism;
+
+namespace {
+
+class MockDeviceAllocator : public DeviceAllocator { // test double: records copy calls
+ public:
+  Result<void*> allocate(
+      size_t nbytes,
+      DeviceIndex index,
+      size_t alignment = kDefaultAlignment) override {
+    return Error::NotSupported; // the mock never hands out memory
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {} // intentionally a no-op
+
+  Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    h2d_call_count_++;
+    last_h2d_nbytes_ = nbytes;
+    last_h2d_device_index_ = index;
+    // "Device" memory is plain host memory here, so really copy to let tests verify data
+    std::memcpy(dst, src, nbytes);
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    d2h_call_count_++;
+    last_d2h_nbytes_ = nbytes;
+    last_d2h_device_index_ = index;
+    std::memcpy(dst, src, nbytes); // same real copy as the H2D direction
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return DeviceType::CUDA; // the mock reports itself as the CUDA allocator
+  }
+
+  int h2d_call_count_ = 0; // number of copy_host_to_device calls
+  int d2h_call_count_ = 0; // number of copy_device_to_host calls
+  size_t last_h2d_nbytes_ = 0; // arguments of the most recent call, per direction
+  size_t last_d2h_nbytes_ = 0;
+  DeviceIndex last_h2d_device_index_ = -1; // -1 until a copy has been recorded
+  DeviceIndex last_d2h_device_index_ = -1;
+};
+
+} // namespace
+
+static MockDeviceAllocator g_mock_cuda; // single mock shared by every test in this file
+
+class OpDeviceCopyTest : public OperatorTest { // fixture: thin op wrappers plus mock reset
+ protected:
+  Tensor& op_h2d_copy_out(const Tensor& self, Tensor& out) {
+    return torch::executor::et_copy::_h2d_copy_outf(context_, self, out);
+  }
+
+  Tensor& op_d2h_copy_out(const Tensor& self, Tensor& out) {
+    return torch::executor::et_copy::_d2h_copy_outf(context_, self, out);
+  }
+
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    if (get_device_allocator(DeviceType::CUDA) == nullptr) { // register only if no CUDA allocator exists yet
+      register_device_allocator(&g_mock_cuda);
+    }
+  }
+
+  void SetUp() override {
+    OperatorTest::SetUp();
+    g_mock_cuda.h2d_call_count_ = 0; // reset recorded state so each test starts clean
+    g_mock_cuda.d2h_call_count_ = 0;
+    g_mock_cuda.last_h2d_nbytes_ = 0;
+    g_mock_cuda.last_d2h_nbytes_ = 0;
+    g_mock_cuda.last_h2d_device_index_ = -1;
+    g_mock_cuda.last_d2h_device_index_ = -1;
+  }
+};
+
+TEST_F(OpDeviceCopyTest, H2dCopyCopiesDataAndCallsAllocator) {
+  // Set up a 4-element float source tensor tagged as CPU, with known data.
+  float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f};
+  int32_t sizes[] = {4};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      src_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor src(&src_impl);
+
+  // Set up a destination tensor tagged as CUDA (backed by host memory in this test).
+  float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f};
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      dst_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0); // presumably the device index; asserted as 0 below
+  Tensor dst(&dst_impl);
+
+  Tensor& result = op_h2d_copy_out(src, dst);
+
+  // Verify the allocator was called exactly once with the full byte count and index.
+  EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 4 * sizeof(float));
+  EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 0);
+
+  // Verify data was copied (mock does a real memcpy).
+  EXPECT_EQ(dst_data[0], 1.0f);
+  EXPECT_EQ(dst_data[1], 2.0f);
+  EXPECT_EQ(dst_data[2], 3.0f);
+  EXPECT_EQ(dst_data[3], 4.0f);
+
+  // Verify the returned reference is the out tensor itself.
+  EXPECT_EQ(&result, &dst);
+}
+
+TEST_F(OpDeviceCopyTest, D2hCopyCopiesDataAndCallsAllocator) {
+  // Set up a CUDA source tensor with known data.
+  float src_data[] = {5.0f, 6.0f, 7.0f, 8.0f};
+  int32_t sizes[] = {4};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      src_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor src(&src_impl);
+
+  // Set up a CPU destination tensor.
+  float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f};
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      dst_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor dst(&dst_impl);
+
+  Tensor& result = op_d2h_copy_out(src, dst);
+
+  // Verify the allocator was called correctly.
+  EXPECT_EQ(g_mock_cuda.d2h_call_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_d2h_nbytes_, 4 * sizeof(float));
+  EXPECT_EQ(g_mock_cuda.last_d2h_device_index_, 0);
+
+  // Verify data was copied.
+  EXPECT_EQ(dst_data[0], 5.0f);
+  EXPECT_EQ(dst_data[1], 6.0f);
+  EXPECT_EQ(dst_data[2], 7.0f);
+  EXPECT_EQ(dst_data[3], 8.0f);
+
+  EXPECT_EQ(&result, &dst);
+}
+
+TEST_F(OpDeviceCopyTest, H2dCopyWithDeviceIndex1) {
+  // Verify device_index is correctly forwarded to the allocator.
+  float src_data[] = {1.0f};
+  float dst_data[] = {0.0f};
+  int32_t sizes[] = {1};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      src_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor src(&src_impl);
+
+  // Device index = 1 (e.g., cuda:1)
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      dst_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      1);
+  Tensor dst(&dst_impl);
+
+  op_h2d_copy_out(src, dst);
+
+  EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 1);
+}
+
+TEST_F(OpDeviceCopyTest, H2dCopyMultidimensionalTensor) {
+  // Test with a 2D tensor [2, 3].
+  float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+  int32_t sizes[] = {2, 3};
+  uint8_t dim_order[] = {0, 1};
+  int32_t strides[] = {3, 1};
+
+  TensorImpl src_impl(
+      ScalarType::Float,
+      2,
+      sizes,
+      src_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor src(&src_impl);
+
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      2,
+      sizes,
+      dst_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor dst(&dst_impl);
+
+  op_h2d_copy_out(src, dst);
+
+  EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 6 * sizeof(float));
+
+  for (int i = 0; i < 6; ++i) {
+    EXPECT_EQ(dst_data[i], src_data[i]);
+  }
+}
diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl
index bc51e336cb8..5212d691c5b 100644
--- a/kernels/test/targets.bzl
+++ b/kernels/test/targets.bzl
@@ -1,14 +1,14 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbsource//xplat/executorch/kernels/test:util.bzl", "codegen_function_header_wrapper", "op_test")
 
-def _common_op_test(name, kernels):
+def _common_op_test(name, kernels, deps = []):
     """
     Defines test targets in format of <kernel>_op_<op-name>_test
     For ATen kernel testing, let's use portable functions.yaml for tested ops.
     """
     for kernel in kernels:
-        deps = [":function_header_wrapper_{}".format(kernel)]
-        op_test(name, kernel_name = kernel, use_kernel_prefix = True, deps = deps)
+        op_deps = [":function_header_wrapper_{}".format(kernel)] + deps
+        op_test(name, kernel_name = kernel, use_kernel_prefix = True, deps = op_deps)
 
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.
@@ -177,6 +177,14 @@ def define_common_targets():
     _common_op_test("op__clone_dim_order_test", ["aten", "portable"])
     _common_op_test("op__conj_physical_test", ["aten", "portable"])
     _common_op_test("op__adaptive_avg_pool2d_test", ["aten", "portable"])
+    _common_op_test(
+        "op__device_copy_test",
+        ["portable"],
+        deps = [
+            "//executorch/runtime/core:device_allocator",
+            "//executorch/runtime/platform:platform",
+        ],
+    )
     _common_op_test("op_abs_test", ["aten", "portable"])
     _common_op_test("op_acos_test", ["aten", "portable"])
     _common_op_test("op_acosh_test", ["aten", "portable"])
diff --git a/resnet18_log1.log b/resnet18_log1.log
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/shim b/shim
index b295819bb0e..cf6a954aae4 160000
--- a/shim
+++ b/shim
@@ -1 +1 @@
-Subproject commit b295819bb0ec636b4e3359828e05476d2437650a
+Subproject commit cf6a954aae4bee7b4515e13475878460115027d1
diff --git a/shim_et/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl
index 5ffa7b65a36..318996784a1 100644
--- a/shim_et/xplat/executorch/codegen/codegen.bzl
+++ b/shim_et/xplat/executorch/codegen/codegen.bzl
@@ -535,6 +535,7 @@ def get_portable_lib_deps():
         "//executorch/kernels/portable/cpu:vec_ops",
         "//executorch/kernels/portable/cpu/pattern:all_deps",
         "//executorch/kernels/portable/cpu/util:all_deps",
+        "//executorch/runtime/core:device_allocator",
     ]
 
 def get_optimized_lib_deps():
diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
index cc2a0f78c75..479f3913f8f 100644
--- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
+++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
@@ -1405,6 +1405,12 @@ ATEN_OPS = (
             "//executorch/kernels/portable/cpu/util:copy_ops_util",
         ],
     ),
+    op_target(
+        name = "op__device_copy",
+        deps = [
+            "//executorch/runtime/core:device_allocator",
+        ],
+    ),
 )
 
 # Operators that are not listed in `functions.yaml` (i.e., operators listed in
diff --git a/third-party/ao b/third-party/ao
index 02105d46c61..01849b2b19c 160000
--- a/third-party/ao
+++ b/third-party/ao
@@ -1 +1 @@
-Subproject commit 02105d46c61dc80a8c9d39d5836e827ba3af8439
+Subproject commit 01849b2b19cb923cb739a1fc02297ba418ddf715
diff --git a/third-party/pocketfft b/third-party/pocketfft
index 81874074463..0fa0ef591e3 160000
--- a/third-party/pocketfft
+++ b/third-party/pocketfft
@@ -1 +1 @@
-Subproject commit 8187407446316c3d16f15e5395dabd4b22f4fec7
+Subproject commit 0fa0ef591e38c2758e3184c6c23e497b9f732ffa

From 9991681b1204d743e7068d7ea496e6494356ea51 Mon Sep 17 00:00:00 2001
From: Suraj Raut <sraut@cadence.com>
Date: Fri, 29 May 2026 00:49:48 -0700
Subject: [PATCH 3/7] Remove accidental files

---
 1                 | 1 -
 resnet18_log1.log | 0
 2 files changed, 1 deletion(-)
 delete mode 100644 1
 delete mode 100644 resnet18_log1.log

diff --git a/1 b/1
deleted file mode 100644
index 8462b88277b..00000000000
--- a/1
+++ /dev/null
@@ -1 +0,0 @@
-usage: list-sessions [-F format]
diff --git a/resnet18_log1.log b/resnet18_log1.log
deleted file mode 100644
index e69de29bb2d..00000000000

From 3dd55592cd2c01a5635c69931076570191ea7066 Mon Sep 17 00:00:00 2001
From: Suraj Raut <sraut@cadence.com>
Date: Fri, 29 May 2026 00:51:51 -0700
Subject: [PATCH 4/7] Sync submodule pointers with upstream/main

---
 shim                  | 2 +-
 third-party/ao        | 2 +-
 third-party/pocketfft | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/shim b/shim
index cf6a954aae4..b295819bb0e 160000
--- a/shim
+++ b/shim
@@ -1 +1 @@
-Subproject commit cf6a954aae4bee7b4515e13475878460115027d1
+Subproject commit b295819bb0ec636b4e3359828e05476d2437650a
diff --git a/third-party/ao b/third-party/ao
index 01849b2b19c..02105d46c61 160000
--- a/third-party/ao
+++ b/third-party/ao
@@ -1 +1 @@
-Subproject commit 01849b2b19cb923cb739a1fc02297ba418ddf715
+Subproject commit 02105d46c61dc80a8c9d39d5836e827ba3af8439
diff --git a/third-party/pocketfft b/third-party/pocketfft
index 0fa0ef591e3..81874074463 160000
--- a/third-party/pocketfft
+++ b/third-party/pocketfft
@@ -1 +1 @@
-Subproject commit 0fa0ef591e38c2758e3184c6c23e497b9f732ffa
+Subproject commit 8187407446316c3d16f15e5395dabd4b22f4fec7

From 6a46467f0b879a69f50a3b80511275b88682f1db Mon Sep 17 00:00:00 2001
From: Suraj Raut <sraut@cadence.com>
Date: Fri, 29 May 2026 00:56:03 -0700
Subject: [PATCH 5/7] Sync remaining files with upstream/main

---
 backends/cuda/runtime/shims/tests/targets.bzl |  24 ++
 .../shims/tests/test_op__device_copy.cpp      | 195 ++++++++++++
 backends/mlx/builder/op_helpers.py            |   2 +-
 backends/mlx/patterns.py                      |  79 ++++-
 backends/mlx/test/test_ops.py                 |  14 +
 .../upsample_bilinear2d_converter.py          | 102 ++++--
 .../upsample_nearest2d_converter.py           | 110 +++++--
 .../test_convert_upsample_bilinear2d.py       | 283 ++++++++++++++++-
 .../test_convert_upsample_nearest2d.py        | 141 ++++++++-
 backends/transforms/aten_to_dialect_pass.py   | 138 +++++++++
 backends/transforms/targets.bzl               |  25 ++
 .../test/test_aten_to_dialect_pass.py         | 239 ++++++++++++++
 examples/models/gemma4_31b/README.md          |   1 +
 examples/models/gemma4_31b/export.py          |   7 +-
 examples/models/gemma4_31b/gguf_loader.py     |  19 +-
 examples/models/gemma4_31b/quant/README.md    |   2 -
 examples/models/gemma4_31b/quant/pack_mlx.py  |   6 +-
 .../gemma4_31b/quant/tests/test_pack_mlx.py   |  46 ++-
 .../gemma4_31b/tests/test_mlx_pipeline.py     |  79 +++++
 .../executor_runner/nxp_executor_runner.cpp   | 183 +++++------
 .../AsrModuleInstrumentationTest.kt           | 260 ++++++++++++++++
 .../executorch/LlmLoraInstrumentationTest.kt  | 291 ++++++++++++++++++
 shim                                          |   2 +-
 third-party/ao                                |   2 +-
 third-party/pocketfft                         |   2 +-
 25 files changed, 2070 insertions(+), 182 deletions(-)
 create mode 100644 backends/cuda/runtime/shims/tests/test_op__device_copy.cpp
 create mode 100644 backends/transforms/aten_to_dialect_pass.py
 create mode 100644 backends/transforms/test/test_aten_to_dialect_pass.py
 create mode 100644 extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/AsrModuleInstrumentationTest.kt
 create mode 100644 extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmLoraInstrumentationTest.kt

diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl
index b68043f7feb..a54c47e979d 100644
--- a/backends/cuda/runtime/shims/tests/targets.bzl
+++ b/backends/cuda/runtime/shims/tests/targets.bzl
@@ -42,3 +42,27 @@ def define_common_targets():
     cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle")
     cuda_shim_cpp_unittest("aoti_torch_item_bool")
     cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out")
+
+    cpp_unittest(
+        name = "test_op__device_copy",
+        srcs = ["test_op__device_copy.cpp"],
+        deps = [
+            "//executorch/backends/cuda/runtime:cuda_backend",
+            "//executorch/kernels/portable:generated_lib",
+            "//executorch/kernels/portable:generated_lib_headers",
+            "//executorch/kernels/portable/cpu:op__device_copy",
+            "//executorch/runtime/core:device_allocator",
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/portable_type:portable_type",
+            "//executorch/runtime/kernel:kernel_runtime_context",
+            "//executorch/runtime/platform:platform",
+        ],
+        external_deps = [
+            ("cuda", None, "cuda-lazy"),
+        ],
+        preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
+        keep_gpu_sections = True,
+        remote_execution = re_test_utils.remote_execution(
+            platform = "gpu-remote-execution",
+        ),
+    )
diff --git a/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp
new file mode 100644
index 00000000000..4e5c5a099b7
--- /dev/null
+++ b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda_runtime.h>
+#include <executorch/kernels/portable/Functions.h>
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/portable_type/tensor_impl.h>
+#include <executorch/runtime/kernel/kernel_runtime_context.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <gtest/gtest.h>
+
+#if (defined(__has_feature) && __has_feature(address_sanitizer)) || \
+    defined(__SANITIZE_ADDRESS__)
+#include <sanitizer/lsan_interface.h>
+#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 1
+#else
+#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 0
+#endif
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::aten::TensorImpl;
+using executorch::runtime::Error;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::KernelRuntimeContext;
+using executorch::runtime::TensorShapeDynamism;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+namespace {
+
+struct CudaDeleter {
+  void operator()(void* ptr) const {
+    if (ptr != nullptr) {
+      cudaFree(ptr);
+    }
+  }
+};
+
+using CudaPtr = std::unique_ptr<void, CudaDeleter>;
+
+CudaPtr allocate_cuda(size_t nbytes) {
+  void* ptr = nullptr;
+  const cudaError_t err = cudaMalloc(&ptr, nbytes);
+  EXPECT_EQ(err, cudaSuccess) << "cudaMalloc failed";
+  return CudaPtr(ptr);
+}
+
+bool is_cuda_available() {
+#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE
+  __lsan_disable();
+#endif
+  int device_count = 0;
+  const cudaError_t err = cudaGetDeviceCount(&device_count);
+#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE
+  __lsan_enable();
+#endif
+  return err == cudaSuccess && device_count > 0;
+}
+
+std::vector<float> copy_cuda_to_host(const void* device_ptr, size_t numel) {
+  std::vector<float> host(numel);
+  const cudaError_t err = cudaMemcpy(
+      host.data(), device_ptr, numel * sizeof(float), cudaMemcpyDeviceToHost);
+  EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy D2H failed";
+  return host;
+}
+
+void copy_host_to_cuda(const std::vector<float>& host, void* device_ptr) {
+  const cudaError_t err = cudaMemcpy(
+      device_ptr,
+      host.data(),
+      host.size() * sizeof(float),
+      cudaMemcpyHostToDevice);
+  EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy H2D failed";
+}
+
+class CudaDeviceCopyOpTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    ASSERT_NE(get_device_allocator(DeviceType::CUDA), nullptr)
+        << "Linking cuda_backend should auto-register the CUDA allocator";
+  }
+
+  void SetUp() override {
+    if (!is_cuda_available()) {
+      GTEST_SKIP() << "CUDA not available, skipping CUDA device copy op tests";
+    }
+  }
+
+  Tensor& op_h2d_copy_out(const Tensor& self, Tensor& out) {
+    return torch::executor::et_copy::_h2d_copy_outf(context_, self, out);
+  }
+
+  Tensor& op_d2h_copy_out(const Tensor& self, Tensor& out) {
+    return torch::executor::et_copy::_d2h_copy_outf(context_, self, out);
+  }
+
+  KernelRuntimeContext context_;
+};
+
+} // namespace
+
+TEST_F(CudaDeviceCopyOpTest, H2dCopyUsesRegisteredCudaAllocator) {
+  std::vector<float> src_data = {1.0f, 2.0f, 3.0f, 4.0f};
+  auto device_data = allocate_cuda(src_data.size() * sizeof(float));
+  ASSERT_NE(device_data.get(), nullptr);
+
+  int32_t sizes[] = {static_cast<int32_t>(src_data.size())};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      src_data.data(),
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor src(&src_impl);
+
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      device_data.get(),
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor dst(&dst_impl);
+
+  Tensor& result = op_h2d_copy_out(src, dst);
+
+  EXPECT_EQ(context_.failure_state(), Error::Ok);
+  EXPECT_EQ(&result, &dst);
+  EXPECT_EQ(copy_cuda_to_host(device_data.get(), src_data.size()), src_data);
+}
+
+TEST_F(CudaDeviceCopyOpTest, D2hCopyUsesRegisteredCudaAllocator) {
+  const std::vector<float> expected = {5.0f, 6.0f, 7.0f, 8.0f};
+  auto device_data = allocate_cuda(expected.size() * sizeof(float));
+  ASSERT_NE(device_data.get(), nullptr);
+  copy_host_to_cuda(expected, device_data.get());
+
+  std::vector<float> dst_data(expected.size(), 0.0f);
+  int32_t sizes[] = {static_cast<int32_t>(expected.size())};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      device_data.get(),
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor src(&src_impl);
+
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      dst_data.data(),
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor dst(&dst_impl);
+
+  Tensor& result = op_d2h_copy_out(src, dst);
+
+  EXPECT_EQ(context_.failure_state(), Error::Ok);
+  EXPECT_EQ(&result, &dst);
+  EXPECT_EQ(dst_data, expected);
+}
diff --git a/backends/mlx/builder/op_helpers.py b/backends/mlx/builder/op_helpers.py
index 40e71e0bdab..7740546cc2c 100644
--- a/backends/mlx/builder/op_helpers.py
+++ b/backends/mlx/builder/op_helpers.py
@@ -334,7 +334,7 @@ def parse_dequant_node(
     if len(non_one) != 1:
         return None
     quantized_dim, group_size = non_one[0]
-    if group_size not in [32, 64, 128]:
+    if group_size not in [16, 32, 64, 128]:
         return None
 
     # TODO: MLX supports 3, 5, and 7, but we need to figure out the
diff --git a/backends/mlx/patterns.py b/backends/mlx/patterns.py
index 29e5e326c69..5f74cbea643 100644
--- a/backends/mlx/patterns.py
+++ b/backends/mlx/patterns.py
@@ -15,6 +15,7 @@
 
 from __future__ import annotations
 
+import os
 from typing import Any, List, Optional, Tuple
 
 import torch
@@ -37,6 +38,7 @@
 )
 from executorch.backends.mlx.serialization.mlx_graph_schema import (
     AddIntNode,
+    AddmmNode,
     AddNode,
     AsTypeNode,
     DequantizeNode,
@@ -52,6 +54,7 @@
     SubtractIntNode,
     SymSizeNode,
     TakeNode,
+    TransposeNode,
 )
 from torch.export.exported_program import ExportedProgram
 from torch.fx.node import Node
@@ -883,6 +886,18 @@ def maybe_create(
             out_dtype=out_dtype,
         )
 
+    # MLX's quantized_matmul Metal kernels are only instantiated for
+    # group_size in {32, 64, 128}. For smaller group sizes (e.g. GGUF
+    # Q6_K with group_size=16), emit DequantizeNode + matmul instead.
+    # Weights stay packed in the .pte file; dequantized on-device.
+    # This non-fused path is significantly slower and must be opted in
+    # via ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1.
+    _MIN_FUSED_GROUP_SIZE = 32
+
+    @staticmethod
+    def _allow_non_fused() -> bool:
+        return os.environ.get("ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS", "0") == "1"
+
     def __call__(self, P: MLXProgramBuilder, n: Node) -> Slot:
         assert n == self.head
 
@@ -908,19 +923,59 @@ def __call__(self, P: MLXProgramBuilder, n: Node) -> Slot:
         x_dtype = x_node.meta["val"].dtype
         needs_cast = self.out_dtype != x_dtype
 
-        P.emit(
-            QuantizedMatmulNode(
-                x=P.slot_to_tid(x_slot),
-                w=P.slot_to_tid(w),
-                scales=P.slot_to_tid(scale_slot),
-                out=P.slot_to_tid(out),
-                biases=P.slot_to_tid(biases),
-                group_size=self.group_size,
-                bits=self.bits,
-                mode="affine",
-                transpose=True,
+        if self.group_size >= self._MIN_FUSED_GROUP_SIZE:
+            P.emit(
+                QuantizedMatmulNode(
+                    x=P.slot_to_tid(x_slot),
+                    w=P.slot_to_tid(w),
+                    scales=P.slot_to_tid(scale_slot),
+                    out=P.slot_to_tid(out),
+                    biases=P.slot_to_tid(biases),
+                    group_size=self.group_size,
+                    bits=self.bits,
+                    mode="affine",
+                    transpose=True,
+                )
             )
-        )
+        else:
+            if not self._allow_non_fused():
+                raise ValueError(
+                    f"Quantized linear with group_size={self.group_size} requires "
+                    f"the non-fused dequantize+matmul path, which is significantly "
+                    f"slower than the fused QuantizedMatmulNode (group_size >= 32). "
+                    f"Set ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1 to allow this."
+                )
+            out_scalar_type = torch_dtype_to_scalar_type(self.out_dtype)
+            _, w_deq = P.make_tmp_slot()
+            P.emit(
+                DequantizeNode(
+                    w=P.slot_to_tid(w),
+                    scales=P.slot_to_tid(scale_slot),
+                    out=P.slot_to_tid(w_deq),
+                    biases=P.slot_to_tid(biases),
+                    group_size=self.group_size,
+                    bits=self.bits,
+                    mode="affine",
+                    dtype=out_scalar_type,
+                )
+            )
+            _, w_t = P.make_tmp_slot()
+            P.emit(
+                TransposeNode(
+                    x=P.slot_to_tid(w_deq),
+                    out=P.slot_to_tid(w_t),
+                    perm=[1, 0],
+                )
+            )
+            P.emit(
+                AddmmNode(
+                    mat1=P.slot_to_tid(x_slot),
+                    mat2=P.slot_to_tid(w_t),
+                    out=P.slot_to_tid(out),
+                )
+            )
+            # DequantizeNode already produces the correct dtype.
+            needs_cast = False
 
         if has_bias:
             P.emit(
diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py
index 4471610519e..45ea024f0e8 100644
--- a/backends/mlx/test/test_ops.py
+++ b/backends/mlx/test/test_ops.py
@@ -24,6 +24,7 @@
 See README.md in this directory for full documentation.
 """
 
+import os
 from typing import Callable, Dict, List, Optional, Tuple
 
 import torch
@@ -5621,8 +5622,21 @@ def get_test_configs(cls) -> List["QuantizedLinearTest"]:
             cls(group_size=128),
             cls(qdtype=torch.int2),
             cls(qdtype=torch.int8),
+            # group_size=16: exercises the non-fused dequantize+matmul path
+            # (requires ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1).
+            cls(qdtype=torch.int8, group_size=16),
+            cls(qdtype=torch.int4, group_size=16),
+            cls(qdtype=torch.int8, group_size=16, bias=False),
         ]
 
+    def generate_test_files(self, verbose=False):
+        if self.group_size < 32:
+            os.environ["ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS"] = "1"
+        try:
+            return super().generate_test_files(verbose=verbose)
+        finally:
+            os.environ.pop("ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS", None)
+
     def create_model(self) -> nn.Module:
         model = LinearModel(self.in_features, self.out_features, bias=self.bias)
         model = model.to(self.dtype)
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_bilinear2d_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_bilinear2d_converter.py
index 33d97dff642..1183ef494b5 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_bilinear2d_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_bilinear2d_converter.py
@@ -4,11 +4,13 @@
 # LICENSE file in the root directory of this source tree.
 
 import numpy as np
+import torch
 
 from executorch.backends.nxp.backend.data_format import DataFormat, NXP_NODE_FORMAT
 from executorch.backends.nxp.backend.edge_helper import node_has_well_defined_shape
 from executorch.backends.nxp.backend.ir.converter.node_converter import (
     CustomDelegationOptions,
+    is_not_qdq_node,
     NodeConverter,
 )
 from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.resize_bilinear_options import (
@@ -16,12 +18,35 @@
 )
 from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from torch.fx import Node
+from torch.fx.passes.infra.partitioner import Partition
 from torch.nn import Parameter
 
 
 # noinspection SpellCheckingInspection
 class UpsampleBilinear2DConverter(NodeConverter):
 
+    @classmethod
+    def supports_partitioning_result(
+        cls,
+        node: Node,
+        partition_list: list[Partition],
+        custom_delegation_options: CustomDelegationOptions,
+        neutron_target_spec: NeutronTargetSpec,
+        parameters_mapping: dict[str, Parameter],
+    ) -> bool:
+        input_shape = node.all_input_nodes[0].meta["val"].shape
+        output_shape = node.meta["val"].shape
+        is_alone_in_partition = cls.is_node_alone_in_partition(
+            node, partition_list, filter_fn=is_not_qdq_node
+        )
+
+        if is_alone_in_partition and input_shape == output_shape:
+            # The operator is a no-op, so the Neutron Converter will skip it. If it's the only node in the
+            #  partition, the graph would end up empty.
+            return False
+
+        return True
+
     @staticmethod
     def _is_supported_in_IR(
         node: Node,
@@ -36,6 +61,14 @@ def _is_supported_in_IR(
                 " format. Please report this."
             )
 
+        # The conversion requires the output shape to be known and static.
+        if not node_has_well_defined_shape(node):
+            return False
+
+        if len(node.meta["val"].shape) != 4:
+            # Unexpected case. The input should always be 4D.
+            return False
+
         return True
 
     @staticmethod
@@ -45,38 +78,58 @@ def _is_supported_on_target(
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
-        # Neutron requires static shapes.
-        #  neutron-converter/src/OperatorC/UpsamplePlugin.cpp?at=NEUTRON_SOFTWARE_2.2.3#74
-        if not node_has_well_defined_shape(node):
-            return False
-
-        if len(node.meta["val"].shape) != 4:
-            # Unexpected case. The input should always be 4D.
-            return False
-
-        # The tensors here use the channels first format (NCHW).
+        # The tensors are always 4D and use the channels first format (NCHW).
         _, in_c, in_h, in_w = node.all_input_nodes[0].meta["val"].shape
         _, _, out_h, out_w = node.meta["val"].shape
 
-        # Neutron supports only the doubling and quadrupleing of both height and width at the same time.
-        #  neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#778
-        supported_scales = [2, 4]
-        if not any(
-            in_h * scale == out_h and in_w * scale == out_w
-            for scale in supported_scales
-        ):
-            return False
-
-        # Neutron requires the input channels to be a multiple of `num_macs`.
-        #  neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#777
-        if in_c % neutron_target_spec.get_num_macs() != 0:
-            return False
+        if custom_delegation_options.use_new_flow_neutron_c:
+            # Requirements specified by the new Neutron flow documentation.
+
+            if not NodeConverter.uses_quantization_type_for_io(
+                node,
+                supported_types=[torch.int8, torch.uint8],
+                input_indices=[0],
+                output_indices=[0],
+            ):
+                return False
+
+            supported_scales = [1, 2, 4, 8]
+            align_corners = node.args[2]
+            if align_corners:
+                if in_h == 1 or in_w == 1:
+                    return False  # Avoid division by 0.
+                h_scale = (out_h - 1) / (in_h - 1)
+                w_scale = (out_w - 1) / (in_w - 1)
+            else:
+                h_scale = out_h / in_h
+                w_scale = out_w / in_w
+
+            # The H and W scales don't need to be equal, but both must be supported.
+            if (h_scale not in supported_scales) or (w_scale not in supported_scales):
+                return False
+
+        else:
+            # Requirements of the old Neutron flow.
+
+            # Neutron supports only the doubling and quadrupling of both height and width at the same time.
+            #  neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#778
+            supported_scales = [2, 4]
+            if not any(
+                in_h * scale == out_h and in_w * scale == out_w
+                for scale in supported_scales
+            ):
+                return False
+
+            # Neutron requires the input channels to be a multiple of `num_macs`.
+            #  neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#777
+            if in_c % neutron_target_spec.get_num_macs() != 0:
+                return False
 
         return True
 
     def convert(self, node: Node):
         """Convert the `aten.upsample_bilinear2d.vec` operator to Neutron IR `ResizeBilinear`.
-        The schema is:
+        The ExecuTorch schema is:
         aten::upsample_bilinear2d.vec(
             Tensor input,
             SymInt[]? output_size,
@@ -109,6 +162,7 @@ def convert(self, node: Node):
         #  and the second one is what NeutronIR uses when `align_corners == False and half_pixel_centers == True`.
         # https://github.com/tensorflow/tensorflow/blob/v2.20.0/tensorflow/lite/kernels/internal/reference/resize_bilinear.h#L82-L88
         # https://github.com/tensorflow/tensorflow/blob/v2.20.0/tensorflow/lite/kernels/internal/reference/resize_bilinear.h#L172-L180
+        # Also, the new Neutron flow requires that `align_corners` and `half_pixel_centers` are not True simultaneously.
         align_corners = node.args[2]
         half_pixel_centers = not align_corners
         t_op.builtin_options = ResizeBilinear(align_corners, half_pixel_centers)
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_nearest2d_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_nearest2d_converter.py
index 1ddc71425ef..6e18a7bfe67 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_nearest2d_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_nearest2d_converter.py
@@ -4,11 +4,13 @@
 # LICENSE file in the root directory of this source tree.
 
 import numpy as np
+import torch
 
 from executorch.backends.nxp.backend.data_format import DataFormat, NXP_NODE_FORMAT
 from executorch.backends.nxp.backend.edge_helper import node_has_well_defined_shape
 from executorch.backends.nxp.backend.ir.converter.node_converter import (
     CustomDelegationOptions,
+    is_not_qdq_node,
     NodeConverter,
 )
 from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.resize_nearest_neighbor_options import (
@@ -16,12 +18,37 @@
 )
 from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from torch.fx import Node
+from torch.fx.passes.infra.partitioner import Partition
 from torch.nn import Parameter
 
+HeightScale = float
+WidthScale = float
+
 
 # noinspection SpellCheckingInspection
 class UpsampleNearest2DConverter(NodeConverter):
 
+    @classmethod
+    def supports_partitioning_result(
+        cls,
+        node: Node,
+        partition_list: list[Partition],
+        custom_delegation_options: CustomDelegationOptions,
+        neutron_target_spec: NeutronTargetSpec,
+        parameters_mapping: dict[str, Parameter],
+    ) -> bool:
+        h_scale, w_scale = cls._get_effective_scales(node)
+        is_alone_in_partition = cls.is_node_alone_in_partition(
+            node, partition_list, filter_fn=is_not_qdq_node
+        )
+
+        if is_alone_in_partition and h_scale == w_scale == 1:
+            # The operator is a no-op, so the Neutron Converter will skip it. If it's the only node in the
+            #  partition, the graph would end up empty.
+            return False
+
+        return True
+
     @staticmethod
     def _is_supported_in_IR(
         node: Node,
@@ -36,6 +63,14 @@ def _is_supported_in_IR(
                 " format. Please report this."
             )
 
+        # The conversion requires the output shape to be known and static.
+        if not node_has_well_defined_shape(node):
+            return False
+
+        if len(node.meta["val"].shape) != 4:
+            # Unexpected case. The input should always be 4D.
+            return False
+
         return True
 
     @staticmethod
@@ -45,39 +80,62 @@ def _is_supported_on_target(
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
-        # Neutron requires static shapes.
-        #  neutron-converter/src/OperatorC/UpsamplePlugin.cpp?at=NEUTRON_SOFTWARE_2.2.3#74
-        if not node_has_well_defined_shape(node):
-            return False
-
-        if len(node.meta["val"].shape) != 4:
-            # Unexpected case. The input should always be 4D.
-            return False
-
-        # The tensors here use the channels first format (NCHW).
+        # The tensors are always 4D and use the channels first format (NCHW).
         _, in_c, in_h, in_w = node.all_input_nodes[0].meta["val"].shape
         _, _, out_h, out_w = node.meta["val"].shape
 
-        # Neutron supports only the doubling and quadrupleing of both height and width at the same time.
-        #  neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#768
-        #  neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#778
-        supported_scales = [2, 4]
-        if not any(
-            in_h * scale == out_h and in_w * scale == out_w
-            for scale in supported_scales
-        ):
-            return False
-
-        # Neutron requires the input channels to be a multiple of `num_macs`.
-        #  neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#767
-        if in_c % neutron_target_spec.get_num_macs() != 0:
-            return False
+        if custom_delegation_options.use_new_flow_neutron_c:
+            # Requirements specified by the new Neutron flow documentation.
+
+            if not NodeConverter.uses_quantization_type_for_io(
+                node,
+                supported_types=[torch.int8, torch.uint8],
+                input_indices=[0],
+                output_indices=[0],
+            ):
+                return False
+
+            supported_scales = [1, 2, 4, 8]
+            h_scale, w_scale = UpsampleNearest2DConverter._get_effective_scales(node)
+            # The H and W scales don't need to be equal but both must be supported.
+            if (h_scale not in supported_scales) or (w_scale not in supported_scales):
+                return False
+
+        else:
+            # Requirements of the old Neutron flow.
+
+            # Neutron supports only the doubling and quadrupling of both height and width at the same time.
+            #  neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#768
+            #  neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#778
+            supported_scales = [2, 4]
+            if not any(
+                in_h * scale == out_h and in_w * scale == out_w
+                for scale in supported_scales
+            ):
+                return False
+
+            # Neutron requires the input channels to be a multiple of `num_macs`.
+            #  neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#767
+            if in_c % neutron_target_spec.get_num_macs() != 0:
+                return False
 
         return True
 
+    @staticmethod
+    def _get_effective_scales(node: Node) -> tuple[HeightScale, WidthScale]:
+        # Neutron supports variants where `align_corners=False` and `align_corners=True`. ExecuTorch doesn't have this
+        #  parameter. Its behavior is equivalent to `align_corners=False`. Hence, the scale calculation corresponds to
+        #  the `align_corners=False` case in the Neutron documentation.
+        _, _, in_h, in_w = node.all_input_nodes[0].meta["val"].shape
+        _, _, out_h, out_w = node.meta["val"].shape
+        h_scale = out_h / in_h
+        w_scale = out_w / in_w
+
+        return h_scale, w_scale
+
     def convert(self, node: Node):
         """Convert the `aten.upsample_nearest2d.vec` operator to Neutron IR `ResizeNearestNeighbor`.
-        The schema is:
+        The ExecuTorch schema is:
             aten::upsample_nearest2d.vec(
                 Tensor input,
                 SymInt[]? output_size,
@@ -90,6 +148,8 @@ def convert(self, node: Node):
         x = t_op.tmp_inputs[0]
         y = t_op.tmp_outputs[0]
 
+        # Neutron supports variants where `align_corners=False` and `align_corners=True`. ExecuTorch doesn't have this
+        #  parameter. Its behavior is equivalent to `align_corners=False` and `half_pixel_centers=False`.
         t_op.builtin_options = ResizeNearestNeighbor(False, False)
 
         # The `aten.upsample_nearest2d` can use either the `size` attribute or the `scale_factor` to define the output
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_bilinear2d.py b/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_bilinear2d.py
index 5663eea9cc3..2d2f9845fa3 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_bilinear2d.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_bilinear2d.py
@@ -4,12 +4,15 @@
 # LICENSE file in the root directory of this source tree.
 
 import numpy as np
+
+# noinspection PyUnusedImports
 import pytest
 import torch
 
 from executorch.backends.nxp.backend.edge_program_converter import (
     EdgeProgramToIRConverter,
 )
+from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator
 from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
 from executorch.backends.nxp.tests.executors import (
     convert_run_compare,
@@ -17,7 +20,17 @@
     ToChannelFirstPreprocess,
     ToChannelLastPreprocess,
 )
-from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
+from executorch.backends.nxp.tests.model_output_comparator import (
+    AllCloseOutputComparator,
+)
+from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
+from executorch.backends.nxp.tests.ops_aliases import (
+    AddTensor,
+    ExecutorchDelegateCall,
+    UpsampleBilinear2D,
+)
+from executorch.backends.nxp.tests.use_qat import *  # noqa F403
 
 
 @pytest.fixture(autouse=True)
@@ -26,23 +39,25 @@ def reseed_model_per_test_run():
     np.random.seed(23)
 
 
-# noinspection PyProtectedMember
-ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate
-UpsampleBilinear2D = exir_ops.edge.aten.upsample_bilinear2d.vec
-
-
 class UpsampleBilinearModule(torch.nn.Module):
 
-    def __init__(self, size=None, scale=None):
+    def __init__(self, size=None, scale=None, **kwargs):
         super().__init__()
         self.upsample = torch.nn.Upsample(
-            size=size, scale_factor=scale, mode="bilinear"
+            size=size, scale_factor=scale, mode="bilinear", **kwargs
         )
 
     def forward(self, x):
         return self.upsample(x)
 
 
+class UpsampleBilinearAddModule(UpsampleBilinearModule):
+
+    def forward(self, x):
+        x = super().forward(x)
+        return x + x
+
+
 @pytest.mark.parametrize(
     "input_shape, size",
     [
@@ -185,3 +200,255 @@ def test_convert_upsample_bilinear2d__no_delegation__unsupported_size(
     # Make sure the `upsample` was NOT delegated (size != double of input).
     assert not graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall])
     assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleBilinear2D])
+
+
+class TestUpsampleBilinear2DNewNeutronFlow:
+    # TODO Use quantized dataset and `atol=1` in the tests.
+
+    # noinspection PyMethodMayBeStatic
+    def assert_delegated(
+        self,
+        model,
+        input_shape,
+        mocker,
+        use_qat=False,
+        atol=None,
+        expected_delegated_ops=None,
+    ):
+        if expected_delegated_ops is None:
+            expected_delegated_ops = {UpsampleBilinear2D: 1}
+
+        graph_verifier = DetailedGraphVerifier(
+            mocker,
+            expected_delegated_ops=expected_delegated_ops,
+            expected_non_delegated_ops={},
+        )
+
+        # Cover also negative values to thoroughly test the operator.
+        dataset_creator = RandomDatasetCreator(low=-2, high=2)
+
+        kwargs = {"atol": atol} if atol is not None else {}
+        output_comparator = AllCloseOutputComparator(**kwargs)
+
+        lower_run_compare(
+            model,
+            input_shape,
+            graph_verifier,
+            dataset_creator,
+            output_comparator,
+            use_qat=use_qat,
+            use_new_flow_neutron_c=True,  # Use the new flow.
+        )
+
+    # noinspection PyMethodMayBeStatic
+    def assert_not_delegated(self, model, input_shape):
+        delegated_ep = to_quantized_edge_program(
+            model, input_shape, use_new_flow_neutron_c=True
+        ).exported_program()
+
+        assert not graph_contains_any_of_ops(
+            delegated_ep.graph, [ExecutorchDelegateCall]
+        )
+        assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleBilinear2D])
+
+    def test__qat__align_corners(self, mocker, use_qat):
+        align_corners = True
+        input_shape = (1, 2, 3, 4)
+        output_size = (5, 7)
+        model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
+        atol = 0.015  # ~= output scale -> single bit error.
+        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat, atol=atol)
+
+    def test__qat__not_align_corners(self, mocker, use_qat):
+        align_corners = False
+        input_shape = (1, 2, 3, 4)
+        output_size = (6, 8)
+        model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
+        atol = 0.015  # ~= output scale -> single bit error.
+        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat, atol=atol)
+
+    @pytest.mark.parametrize(
+        "input_shape, output_size",
+        [
+            pytest.param((1, 2, 3, 4), (6, 8), id="batch=1, scale_h=scale_w=2"),
+            pytest.param(
+                (3, 3, 3, 5),
+                (6, 5),
+                id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)",
+            ),
+            pytest.param((2, 2, 3, 4), (3, 16), id="batch=2, scale_h=1, scale_w=4"),
+            pytest.param((2, 2, 3, 4), (24, 8), id="batch=2, scale_h=8, scale_w=2"),
+        ],
+    )
+    def test__not_align_corners__output_size(self, mocker, input_shape, output_size):
+        align_corners = False
+        model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
+        atol = 0.016  # ~= output scale -> single bit error.
+        self.assert_delegated(model, input_shape, mocker, atol=atol)
+
+    def test__not_align_corners__output_size__unsupported(self):
+        align_corners = False
+        input_shape = (1, 2, 3, 4)
+        output_size = (9, 12)  # scale = (3, 3)
+        model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
+        self.assert_not_delegated(model, input_shape)
+
+    @pytest.mark.parametrize(
+        "input_shape, scale",
+        [
+            pytest.param((1, 2, 3, 4), (2, 2), id="batch=1, scale_h=scale_w=2"),
+            pytest.param(
+                (3, 3, 3, 5),
+                (2, 1),
+                id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)",
+            ),
+            pytest.param((2, 2, 3, 4), (4, 1), id="batch=2, scale_h=4, scale_w=1"),
+            pytest.param((2, 2, 3, 4), (2, 8), id="batch=2, scale_h=2, scale_w=8"),
+        ],
+    )
+    def test__not_align_corners__scales(self, mocker, input_shape, scale):
+        align_corners = False
+        model = UpsampleBilinearModule(scale=scale, align_corners=align_corners)
+        atol = 0.016  # ~= output scale -> single bit error.
+        self.assert_delegated(model, input_shape, mocker, atol=atol)
+
+    def test__not_align_corners__scales__unsupported(self):
+        align_corners = False
+        input_shape = (1, 2, 3, 4)
+        scale = (3, 3)
+        model = UpsampleBilinearModule(scale=scale, align_corners=align_corners)
+        self.assert_not_delegated(model, input_shape)
+
+    @pytest.mark.parametrize(
+        "input_shape, output_size",
+        [
+            pytest.param((1, 2, 4, 5), (7, 9), id="batch=1, scale_h=scale_w=2"),
+            pytest.param(
+                (1, 3, 3, 5),
+                (5, 5),
+                id="batch=1, scale_h=2, scale_w=1 (no num_macs multiples)",
+            ),
+            pytest.param((2, 2, 4, 5), (4, 17), id="batch=2, scale_h=1, scale_w=4"),
+            pytest.param((1, 2, 4, 5), (25, 9), id="batch=1, scale_h=8, scale_w=2"),
+        ],
+    )
+    def test__align_corners__output_size(self, mocker, input_shape, output_size):
+        align_corners = True
+        model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
+        atol = 0.016  # ~= output scale -> single bit error.
+        self.assert_delegated(model, input_shape, mocker, atol=atol)
+
+    @pytest.mark.parametrize(
+        "input_shape, output_size",
+        [
+            pytest.param(
+                (2, 2, 4, 5), (25, 9), id="batch=2, scale_h=8, scale_w=2"
+            ),  # Error ~= 0.47
+            pytest.param(
+                (3, 3, 3, 5),
+                (5, 5),
+                id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)",
+            ),  # Error ~= 3.7
+        ],
+    )
+    def test__align_corners__output_size__incorrect_output(
+        self, mocker, input_shape, output_size
+    ):
+        align_corners = True
+        model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
+        atol = 0.45  # Huge tolerance (still not enough to pass).
+        with pytest.raises(AssertionError):
+            self.assert_delegated(model, input_shape, mocker, atol=atol)
+
+    def test__align_corners__output_size__unsupported(self):
+        align_corners = True
+        input_shape = (1, 2, 3, 4)
+        output_size = (6, 8)  # Neutron scale = (5/2, 7/3)
+        model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
+        self.assert_not_delegated(model, input_shape)
+
+    def test__align_corners__output_size__input_size_equal_to_one(self):
+        align_corners = True
+        input_shape = (1, 2, 1, 1)  # Neutron scale computation would divide by zero.
+        output_size = (2, 2)
+        model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
+        self.assert_not_delegated(model, input_shape)
+
+    @pytest.mark.parametrize(
+        "input_shape, scale",
+        [
+            # The PyTorch scales are "weird" because the "Neutron scales" are computed differently.
+            # The fractions correspond to "nice" Neutron scales (1, 2, 4, or 8).
+            pytest.param(
+                (1, 2, 4, 5),
+                (7 / 4, 9 / 5),
+                id="batch=1, scale_h=7/4, scale_w=9/5 (Neutron scales = (2, 2))",
+            ),
+            pytest.param(
+                (1, 3, 3, 5),
+                (5 / 3, 1),
+                id="batch=1, scale_h=5/3, scale_w=1 (Neutron scales = (2, 1))",
+            ),
+            pytest.param(
+                (2, 2, 4, 5),
+                (1, 17 / 5),
+                id="batch=2, scale_h=1, scale_w=17/5 (Neutron scales = (1, 4))",
+            ),
+            pytest.param(
+                (1, 2, 4, 5),
+                (25 / 4, 9 / 5),
+                id="batch=1, scale_h=25/4, scale_w=9/5 (Neutron scales = (8, 2))",
+            ),
+        ],
+    )
+    def test__align_corners__scales(self, mocker, input_shape, scale):
+        align_corners = True
+        model = UpsampleBilinearModule(scale=scale, align_corners=align_corners)
+        atol = 0.016  # ~= output scale -> single bit error.
+        self.assert_delegated(model, input_shape, mocker, atol=atol)
+
+    @pytest.mark.parametrize(
+        "input_shape, scale",
+        [
+            pytest.param(
+                (2, 2, 4, 5),
+                (25 / 4, 9 / 5),
+                id="batch=2, scale_h=25/4, scale_w=9/5 (Neutron scales = (8, 2))",
+            ),  # Error ~= 0.47
+            pytest.param(
+                (3, 3, 3, 5),
+                (5 / 3, 1),
+                id="batch=3, scale_h=5/3, scale_w=1 (Neutron scales = (2, 1))",
+            ),  # Error ~= 3.7
+        ],
+    )
+    def test__align_corners__scales__incorrect_output(self, mocker, input_shape, scale):
+        align_corners = True
+        model = UpsampleBilinearModule(scale=scale, align_corners=align_corners)
+        atol = 0.45  # Huge tolerance (still not enough to pass).
+        with pytest.raises(AssertionError):
+            self.assert_delegated(model, input_shape, mocker, atol=atol)
+
+    def test__align_corners__scales__unsupported(self):
+        align_corners = True
+        input_shape = (1, 2, 3, 4)
+        scale = (2, 2)  # Neutron scale = (5/2, 7/3)
+        model = UpsampleBilinearModule(scale=scale, align_corners=align_corners)
+        self.assert_not_delegated(model, input_shape)
+
+    def test__noop__alone_in_partition__not_delegated(self):
+        input_shape = (1, 2, 3, 4)
+        scale = 1
+        model = UpsampleBilinearModule(scale=scale)
+        self.assert_not_delegated(model, input_shape)
+
+    def test__noop__not_alone_in_partition__delegated(self, mocker):
+        input_shape = (1, 2, 3, 4)
+        scale = 1
+        model = UpsampleBilinearAddModule(scale=scale)
+        self.assert_delegated(
+            model,
+            input_shape,
+            mocker,
+            expected_delegated_ops={UpsampleBilinear2D: 1, AddTensor: 1},
+        )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_nearest2d.py b/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_nearest2d.py
index 3d9ec84dec9..27d1ac718a0 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_nearest2d.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_nearest2d.py
@@ -4,12 +4,15 @@
 # LICENSE file in the root directory of this source tree.
 
 import numpy as np
+
+# noinspection PyUnusedImports
 import pytest
 import torch
 
 from executorch.backends.nxp.backend.edge_program_converter import (
     EdgeProgramToIRConverter,
 )
+from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator
 from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
 from executorch.backends.nxp.tests.executors import (
     convert_run_compare,
@@ -17,7 +20,14 @@
     ToChannelFirstPreprocess,
     ToChannelLastPreprocess,
 )
-from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
+from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
+from executorch.backends.nxp.tests.ops_aliases import (
+    AddTensor,
+    ExecutorchDelegateCall,
+    UpsampleNearest2D,
+)
+from executorch.backends.nxp.tests.use_qat import *  # noqa F403
 
 
 @pytest.fixture(autouse=True)
@@ -26,11 +36,6 @@ def reseed_model_per_test_run():
     np.random.seed(23)
 
 
-# noinspection PyProtectedMember
-ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate
-UpsampleNearest2D = exir_ops.edge.aten.upsample_nearest2d.vec
-
-
 class UpsampleNearestModule(torch.nn.Module):
 
     def __init__(self, size=None, scale=None):
@@ -41,6 +46,13 @@ def forward(self, x):
         return self.upsample(x)
 
 
+class UpsampleNearestAddModule(UpsampleNearestModule):
+
+    def forward(self, x):
+        x = super().forward(x)
+        return x + x
+
+
 @pytest.mark.parametrize(
     "input_shape, size",
     [
@@ -181,3 +193,120 @@ def test_convert_upsample_nearest2d__no_delegation__unsupported_size(input_shape
     # Make sure the `upsample` was NOT delegated (size != double of input).
     assert not graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall])
     assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleNearest2D])
+
+
+class TestUpsampleNearest2DNewNeutronFlow:
+
+    # noinspection PyMethodMayBeStatic
+    def assert_delegated(
+        self,
+        model,
+        input_shape,
+        mocker,
+        use_qat=False,
+        expected_delegated_ops=None,
+    ):
+        if expected_delegated_ops is None:
+            expected_delegated_ops = {UpsampleNearest2D: 1}
+
+        graph_verifier = DetailedGraphVerifier(
+            mocker,
+            expected_delegated_ops=expected_delegated_ops,
+            expected_non_delegated_ops={},
+        )
+
+        # Cover also negative values to thoroughly test the operator.
+        dataset_creator = RandomDatasetCreator(low=-2, high=2)
+
+        lower_run_compare(
+            model,
+            input_shape,
+            graph_verifier,
+            dataset_creator,
+            use_qat=use_qat,
+            use_new_flow_neutron_c=True,  # Use the new flow.
+        )
+
+    # noinspection PyMethodMayBeStatic
+    def assert_not_delegated(self, model, input_shape):
+        delegated_ep = to_quantized_edge_program(
+            model, input_shape, use_new_flow_neutron_c=True
+        ).exported_program()
+
+        assert not graph_contains_any_of_ops(
+            delegated_ep.graph, [ExecutorchDelegateCall]
+        )
+        assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleNearest2D])
+
+    def test__qat(self, mocker, use_qat):
+        input_shape = (1, 2, 3, 4)
+        output_size = (6, 8)
+        model = UpsampleNearestModule(size=output_size)
+        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat)
+
+    @pytest.mark.parametrize(
+        "input_shape, output_size",
+        [
+            pytest.param((1, 2, 3, 4), (6, 8), id="batch=1, scale_h=scale_w=2"),
+            pytest.param((1, 2, 3, 3), 6, id="batch=1, scale_h=scale_w=2, scalar size"),
+            pytest.param(
+                (3, 3, 3, 5),
+                (6, 5),
+                id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)",
+            ),
+            pytest.param((2, 2, 3, 4), (3, 16), id="batch=2, scale_h=1, scale_w=4"),
+            pytest.param((2, 2, 3, 4), (24, 8), id="batch=2, scale_h=8, scale_w=2"),
+        ],
+    )
+    def test__output_size(self, mocker, input_shape, output_size):
+        model = UpsampleNearestModule(size=output_size)
+        self.assert_delegated(model, input_shape, mocker)
+
+    def test__output_size__unsupported(self):
+        input_shape = (1, 2, 3, 4)
+        output_size = (9, 12)  # scale = (3, 3)
+        model = UpsampleNearestModule(size=output_size)
+        self.assert_not_delegated(model, input_shape)
+
+    @pytest.mark.parametrize(
+        "input_shape, scale",
+        [
+            pytest.param((1, 2, 3, 4), (2, 2), id="batch=1, scale_h=scale_w=2"),
+            pytest.param(
+                (1, 2, 3, 4), 4, id="batch=1, scale_h=scale_w=4, scalar scale"
+            ),
+            pytest.param(
+                (3, 3, 3, 5),
+                (2, 1),
+                id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)",
+            ),
+            pytest.param((2, 2, 3, 4), (4, 1), id="batch=2, scale_h=4, scale_w=1"),
+            pytest.param((2, 2, 3, 4), (2, 8), id="batch=2, scale_h=2, scale_w=8"),
+        ],
+    )
+    def test__scales(self, mocker, input_shape, scale):
+        model = UpsampleNearestModule(scale=scale)
+        self.assert_delegated(model, input_shape, mocker)
+
+    def test__scales__unsupported(self):
+        input_shape = (1, 2, 3, 4)
+        scale = (3, 3)
+        model = UpsampleNearestModule(scale=scale)
+        self.assert_not_delegated(model, input_shape)
+
+    def test__noop__alone_in_partition__not_delegated(self):
+        input_shape = (1, 2, 3, 4)
+        scale = 1
+        model = UpsampleNearestModule(scale=scale)
+        self.assert_not_delegated(model, input_shape)
+
+    def test__noop__not_alone_in_partition__delegated(self, mocker):
+        input_shape = (1, 2, 3, 4)
+        scale = 1
+        model = UpsampleNearestAddModule(scale=scale)
+        self.assert_delegated(
+            model,
+            input_shape,
+            mocker,
+            expected_delegated_ops={UpsampleNearest2D: 1, AddTensor: 1},
+        )
diff --git a/backends/transforms/aten_to_dialect_pass.py b/backends/transforms/aten_to_dialect_pass.py
new file mode 100644
index 00000000000..f31df73bc58
--- /dev/null
+++ b/backends/transforms/aten_to_dialect_pass.py
@@ -0,0 +1,138 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import traceback
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import ClassVar, TypeAlias
+
+import torch
+
+from executorch.backends.xnnpack._passes.xnnpack_pass import ExportPass
+
+from executorch.exir import ExportedProgram
+from torch.fx.node import Target
+from torch.fx.passes.infra.pass_manager import PassResult
+
+
+# Expected type to be returned by substitution functions.
+@dataclass
+class DialectNodeSpec:
+    op: Target
+    args: tuple
+    kwargs: dict | None = None
+
+
+# Expected type to be used for substitution functions
+SubstitutionFn: TypeAlias = Callable[
+    [torch.fx.Node, torch.export.ExportedProgram], DialectNodeSpec | None
+]
+
+
+class AtenToDialectPass(ExportPass):
+    """
+    General pass to convert ops 1-1 from ATen to a specific dialect.
+
+    Usage:
+        1. Subclass the pass for a specific dialect
+        2. For each ATen target to be substituted, implement a function returning a DialectNodeSpec defining the
+           corresponding dialect op, or None if the substitution does not apply.
+        3. Register each substitution function for the subclass using the decorator register_dialect_substitution
+
+    Only one substitution function can be registered for a given target.
+
+    The pass must be initialized with an exported_program to allow substitution functions to modify placeholders,
+    e.g. if the dialect ops require additional scratch buffers.
+    """
+
+    _DIALECT_SUBSTITUTIONS: ClassVar[dict[Target, SubstitutionFn]] = {}
+
+    def __init__(self, exported_program: ExportedProgram):
+        super().__init__()
+        self.exported_program: ExportedProgram = exported_program
+
+    # Ensure each subclass has its own substitution registry.
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        cls._DIALECT_SUBSTITUTIONS = {}
+
+    @classmethod
+    def register_dialect_substitution(
+        cls, target: Target
+    ) -> Callable[[SubstitutionFn], SubstitutionFn]:
+
+        def decorator(func: SubstitutionFn) -> SubstitutionFn:
+            if target in cls._DIALECT_SUBSTITUTIONS:
+                raise RuntimeError(
+                    f"Multiple substitutions registered for the same target in {cls.__name__} are not allowed."
+                )
+            else:
+                cls._DIALECT_SUBSTITUTIONS[target] = func
+            return func
+
+        return decorator
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        modified = False
+
+        for node in graph_module.graph.nodes:
+            if node.op != "call_function":
+                continue
+
+            substitution_func = self._DIALECT_SUBSTITUTIONS.get(node.target, None)
+            if substitution_func is None:
+                continue
+
+            dialect_node_spec = substitution_func(node, self.exported_program)
+            if dialect_node_spec is None:
+                continue
+
+            modified = True
+            with graph_module.graph.inserting_before(node):
+                dialect_node = graph_module.graph.create_node(
+                    "call_function",
+                    target=dialect_node_spec.op,
+                    args=dialect_node_spec.args,
+                    kwargs=dialect_node_spec.kwargs or {},
+                )
+
+                node.replace_all_uses_with(dialect_node)
+
+                # Keep same meta dict for new node and append new trace
+                dialect_node.meta = node.meta
+                old_stack_trace = dialect_node.meta.get("stack_trace", "")
+                dialect_node.meta["stack_trace"] = (
+                    f"{old_stack_trace}\n{traceback.format_stack()[-2]}"
+                )
+
+                graph_module.graph.erase_node(node)
+
+        if modified:
+            graph_module.graph.eliminate_dead_code()
+            graph_module.recompile()
+            graph_module = super().call(graph_module).graph_module
+
+        return PassResult(graph_module, modified)
+
+    def requires(self, graph_module):
+        self.ops_before = sum(
+            1 for node in graph_module.graph.nodes if node.op == "call_function"
+        )
+        return super().requires(graph_module)
+
+    def ensures(self, graph_module: torch.fx.GraphModule) -> bool:
+        """Ensure that there has only been 1-1 substitution of call_function nodes, i.e. that the number of call_function nodes is preserved after the pass."""
+
+        self.ops_after = sum(
+            1 for node in graph_module.graph.nodes if node.op == "call_function"
+        )
+        if self.ops_after != self.ops_before:
+            raise RuntimeError(
+                f"{self.__class__.__name__} did not preserve the number of call_function nodes: "
+                f"before={self.ops_before}, after={self.ops_after}"
+            )
+
+        return super().ensures(graph_module)
diff --git a/backends/transforms/targets.bzl b/backends/transforms/targets.bzl
index 8c3603e293d..36466ec4aa0 100644
--- a/backends/transforms/targets.bzl
+++ b/backends/transforms/targets.bzl
@@ -176,6 +176,21 @@ def define_common_targets():
         ],
     )
 
+    runtime.python_library(
+        name = "aten_to_dialect_pass",
+        srcs = [
+            "aten_to_dialect_pass.py",
+        ],
+        visibility = [
+            "//executorch/backends/...",
+        ],
+        deps = [
+            "//caffe2:torch",
+            "//executorch/backends/xnnpack/_passes:xnnpack_passes",
+            "//executorch/exir:lib",
+        ],
+    )
+
     runtime.python_library(
         name = "rank_0_to_rank_1",
         srcs = [
@@ -243,6 +258,16 @@ def define_common_targets():
         ],
     )
 
+    runtime.python_test(
+        name = "test_aten_to_dialect_pass",
+        srcs = [
+            "test/test_aten_to_dialect_pass.py",
+        ],
+        deps = [
+            "//caffe2:torch",
+            ":aten_to_dialect_pass",
+        ],
+    )
 
     runtime.python_test(
         name = "test_rank_0_to_rank_1",
diff --git a/backends/transforms/test/test_aten_to_dialect_pass.py b/backends/transforms/test/test_aten_to_dialect_pass.py
new file mode 100644
index 00000000000..80dbf210d72
--- /dev/null
+++ b/backends/transforms/test/test_aten_to_dialect_pass.py
@@ -0,0 +1,239 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pytest
+import torch
+from executorch.backends.transforms.aten_to_dialect_pass import (
+    AtenToDialectPass,
+    DialectNodeSpec,
+)
+from executorch.backends.transforms.utils import create_constant_placeholder
+from torch.export import ExportedProgram
+from torch.export.graph_signature import InputKind
+from torch.fx import Node
+
+
+class AddModel(torch.nn.Module):
+    def forward(self, x, y):
+        return torch.ops.aten.add.Tensor(x, y)
+
+
+class AddAlphaModel(torch.nn.Module):
+    def forward(self, x, y):
+        return torch.ops.aten.add.Tensor(x, y, alpha=2)
+
+
+def _count_target(graph_module: torch.fx.GraphModule, target) -> int:
+    """Count the ``call_function`` nodes in the graph that invoke ``target``."""
+    total = 0
+    for candidate in graph_module.graph.nodes:
+        total += int(candidate.op == "call_function" and candidate.target == target)
+    return total
+
+
+def _get_target_node(graph_module: torch.fx.GraphModule, target) -> Node:
+    """Return the single ``call_function`` node whose target is ``target``."""
+    matches = []
+    for candidate in graph_module.graph.nodes:
+        if candidate.op == "call_function" and candidate.target == target:
+            matches.append(candidate)
+    assert len(matches) == 1
+    return matches[0]
+
+
+def _export_add_model() -> ExportedProgram:
+    """Export ``AddModel`` in strict mode with two random 2x3 inputs."""
+    inputs = (torch.randn(2, 3), torch.randn(2, 3))
+    return torch.export.export(AddModel().eval(), inputs, strict=True)
+
+
+def _export_add_alpha_model() -> ExportedProgram:
+    """Export ``AddAlphaModel`` in strict mode with two random 2x3 inputs."""
+    inputs = (torch.randn(2, 3), torch.randn(2, 3))
+    return torch.export.export(AddAlphaModel().eval(), inputs, strict=True)
+
+
+def test_rewrites_node_when_substitution_matches() -> None:
+    class _TestAtenToDialectPass(AtenToDialectPass):
+        pass
+
+    @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor)
+    def replace_add_with_sub(
+        node: Node, exported_program: ExportedProgram
+    ) -> DialectNodeSpec | None:
+        del exported_program
+        return DialectNodeSpec(torch.ops.aten.sub.Tensor, node.args)
+
+    exported_program = _export_add_model()
+    result = _TestAtenToDialectPass(exported_program=exported_program).call(
+        exported_program.graph_module
+    )
+
+    assert result.modified
+    assert _count_target(result.graph_module, torch.ops.aten.add.Tensor) == 0
+    assert _count_target(result.graph_module, torch.ops.aten.sub.Tensor) == 1
+
+
+def test_substitution_can_add_state_dict_placeholder() -> None:
+    class _TestAtenToDialectPass(AtenToDialectPass):
+        pass
+
+    @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor)
+    def replace_add_rhs_with_constant(
+        node: Node, exported_program: ExportedProgram
+    ) -> DialectNodeSpec | None:
+        first_placeholder = next(
+            graph_node
+            for graph_node in node.graph.nodes
+            if graph_node.op == "placeholder"
+        )
+        with node.graph.inserting_before(first_placeholder):
+            const_node = create_constant_placeholder(
+                exp_program=exported_program,
+                graph=node.graph,
+                name="test_constant",
+                kind=InputKind.PARAMETER,
+                data=torch.ones(2, 3),
+            )
+        return DialectNodeSpec(torch.ops.aten.add.Tensor, (node.args[0], const_node))
+
+    exported_program = _export_add_model()
+    result = _TestAtenToDialectPass(exported_program=exported_program).call(
+        exported_program.graph_module
+    )
+
+    assert result.modified
+    assert "test_constant" in exported_program.state_dict
+    assert torch.equal(exported_program.state_dict["test_constant"], torch.ones(2, 3))
+    assert (
+        exported_program.graph_signature.inputs_to_parameters["test_constant"]
+        == "test_constant"
+    )
+    add_node = _get_target_node(result.graph_module, torch.ops.aten.add.Tensor)
+    assert add_node.args[1].name == "test_constant"
+
+    x = torch.full((2, 3), 2.0)
+    y = torch.full((2, 3), 5.0)
+    torch.testing.assert_close(exported_program.module()(x, y), x + torch.ones_like(x))
+
+
+def test_substitution_can_change_kwargs() -> None:
+    class _TestAtenToDialectPass(AtenToDialectPass):
+        pass
+
+    @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor)
+    def replace_add_alpha(
+        node: Node, exported_program: ExportedProgram
+    ) -> DialectNodeSpec | None:
+        del exported_program
+        return DialectNodeSpec(torch.ops.aten.add.Tensor, node.args, {"alpha": 3})
+
+    exported_program = _export_add_alpha_model()
+    result = _TestAtenToDialectPass(exported_program=exported_program).call(
+        exported_program.graph_module
+    )
+
+    assert result.modified
+    add_node = _get_target_node(result.graph_module, torch.ops.aten.add.Tensor)
+    assert add_node.kwargs["alpha"] == 3
+
+    x = torch.full((2, 3), 2.0)
+    y = torch.full((2, 3), 5.0)
+    torch.testing.assert_close(exported_program.module()(x, y), x + 3 * y)
+
+
+def test_preserves_meta_when_substitution_matches() -> None:
+    class _TestAtenToDialectPass(AtenToDialectPass):
+        pass
+
+    @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor)
+    def replace_add_with_sub(
+        node: Node, exported_program: ExportedProgram
+    ) -> DialectNodeSpec | None:
+        del exported_program
+        return DialectNodeSpec(torch.ops.aten.sub.Tensor, node.args)
+
+    exported_program = _export_add_model()
+    add_node = _get_target_node(
+        exported_program.graph_module, torch.ops.aten.add.Tensor
+    )
+    add_node.meta["test_sentinel"] = "kept"
+    add_node.meta["stack_trace"] = "original stack"
+
+    result = _TestAtenToDialectPass(exported_program=exported_program).call(
+        exported_program.graph_module
+    )
+
+    sub_node = _get_target_node(result.graph_module, torch.ops.aten.sub.Tensor)
+    assert sub_node.meta["test_sentinel"] == "kept"
+    assert sub_node.meta["stack_trace"].startswith("original stack\n")
+    assert sub_node.meta["stack_trace"] != "original stack"
+
+
+def test_keeps_node_when_substitution_returns_none() -> None:
+    class _TestAtenToDialectPass(AtenToDialectPass):
+        pass
+
+    @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor)
+    def do_not_replace(
+        node: Node, exported_program: ExportedProgram
+    ) -> DialectNodeSpec | None:
+        del node, exported_program
+        return None
+
+    exported_program = _export_add_model()
+    result = _TestAtenToDialectPass(exported_program=exported_program).call(
+        exported_program.graph_module
+    )
+
+    assert not result.modified
+    assert _count_target(result.graph_module, torch.ops.aten.add.Tensor) == 1
+    assert _count_target(result.graph_module, torch.ops.aten.sub.Tensor) == 0
+
+
+def test_raises_when_duplicate_substitution_is_registered() -> None:
+    class _TestAtenToDialectPass(AtenToDialectPass):
+        pass
+
+    @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor)
+    def first_replace(
+        node: Node, exported_program: ExportedProgram
+    ) -> DialectNodeSpec | None:
+        del exported_program
+        return DialectNodeSpec(torch.ops.aten.sub.Tensor, node.args)
+
+    with pytest.raises(RuntimeError, match="Multiple substitutions registered"):
+
+        @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor)
+        def second_replace(
+            node: Node, exported_program: ExportedProgram
+        ) -> DialectNodeSpec | None:
+            del exported_program
+            return DialectNodeSpec(torch.ops.aten.mul.Tensor, node.args)
+
+
+def test_ensures_raises_when_call_function_count_changes() -> None:
+    class _TestAtenToDialectPass(AtenToDialectPass):
+        pass
+
+    exported_program = _export_add_model()
+    graph_module = exported_program.graph_module
+    test_pass = _TestAtenToDialectPass(exported_program=exported_program)
+    test_pass.requires(graph_module)
+
+    placeholders = [
+        node for node in graph_module.graph.nodes if node.op == "placeholder"
+    ]
+    output_node = next(node for node in graph_module.graph.nodes if node.op == "output")
+    with graph_module.graph.inserting_before(output_node):
+        graph_module.graph.create_node(
+            "call_function",
+            target=torch.ops.aten.sub.Tensor,
+            args=tuple(placeholders),
+            kwargs={},
+        )
+
+    with pytest.raises(RuntimeError, match="did not preserve"):
+        test_pass.ensures(graph_module)
diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md
index da4aa893079..c6ac10748d8 100644
--- a/examples/models/gemma4_31b/README.md
+++ b/examples/models/gemma4_31b/README.md
@@ -15,6 +15,7 @@ both export and eager inference:
 |---|---|---|
 | `quantize_and_save.py` | bf16 HF checkpoint → quantized checkpoint (one-time) | ~30 GB CPU |
 | `export.py --prequantized <dir>` | quantized checkpoint → `model.pte` + `model.ptd` | ~24 GB CPU + CUDA for packing |
+| `export.py --gguf <file> [--backend mlx]` | GGUF file (Q4_K_M, etc.) → `model.pte` + `model.ptd` | ~24 GB CPU |
 | `inference.py --prequantized <dir>` | quantized checkpoint → eager generation under `torch.compile` | ~24 GB GPU |
 | `inference.py --gguf <file>` | GGUF file (Q4_K_M, etc.) → eager generation | ~24 GB GPU |
 | `export.py --model-dir <hf>` | one-shot bf16 → quantize → export (no intermediate file) | ~30 GB CPU + CUDA for packing |
diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py
index 046e365947b..bd648f534b5 100644
--- a/examples/models/gemma4_31b/export.py
+++ b/examples/models/gemma4_31b/export.py
@@ -443,7 +443,12 @@ def main() -> None:
             backend=args.backend,
         )
 
-    export_and_lower(model, config, args.output_dir, backend=args.backend)
+    if args.gguf and args.backend == "mlx":
+        os.environ["ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS"] = "1"
+    try:
+        export_and_lower(model, config, args.output_dir, backend=args.backend)
+    finally:
+        os.environ.pop("ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS", None)
 
 
 if __name__ == "__main__":
diff --git a/examples/models/gemma4_31b/gguf_loader.py b/examples/models/gemma4_31b/gguf_loader.py
index 3e50991e553..35dddb5a0dc 100644
--- a/examples/models/gemma4_31b/gguf_loader.py
+++ b/examples/models/gemma4_31b/gguf_loader.py
@@ -12,6 +12,7 @@
 
 Usage:
     model, config = load_gguf_model("model.gguf", backend="cuda")
+    model, config = load_gguf_model("model.gguf", backend="mlx")
 """
 
 from typing import Optional
@@ -104,10 +105,11 @@ def load_gguf_model(
     Streams tensors one at a time for low peak memory.
 
     GGUF ties ``embed_tokens`` and ``lm_head`` into a single Q4_K tensor.
-    We untie them: the embedding is dequantized to bf16 (``nn.Embedding``
-    needs gather, which ``Int4TilePackedTo4dTensor`` does not support),
-    while ``lm_head`` keeps the original Q4_K quantization (``nn.Linear``
-    matmul via tinygemm).
+    We untie them so ``lm_head`` keeps the original Q4_K quantization.
+    On CUDA, the embedding is dequantized to bf16 because ``Int4Tensor``
+    does not support the gather op that ``nn.Embedding`` requires.  On
+    MLX, the embedding stays quantized — ``QuantizedEmbeddingHandler``
+    handles quantized gather natively.
 
     Returns ``(model, config)``.
     """
@@ -120,8 +122,12 @@ def load_gguf_model(
         from executorch.examples.models.gemma4_31b.quant import DEFAULT_CUDA_PACKERS
 
         packers = DEFAULT_CUDA_PACKERS
+    elif backend == "mlx":
+        from executorch.examples.models.gemma4_31b.quant import DEFAULT_MLX_PACKERS
+
+        packers = DEFAULT_MLX_PACKERS
     else:
-        raise ValueError(f"Unsupported backend: {backend!r}. Supported: 'cuda'.")
+        raise ValueError(f"Unsupported backend: {backend!r}. Supported: 'cuda', 'mlx'.")
 
     config = Gemma4_31BConfig(max_seq_len=max_seq_len)
 
@@ -143,7 +149,8 @@ def load_gguf_model(
 
         if model_key == "embed_tokens.weight" and isinstance(result, Int4Tensor):
             embed_quant = result
-            result = dequantize_weight(result, torch.bfloat16)
+            if backend == "cuda":
+                result = dequantize_weight(result, torch.bfloat16)
 
         pack_one(model, model_key, result, packers)
 
diff --git a/examples/models/gemma4_31b/quant/README.md b/examples/models/gemma4_31b/quant/README.md
index 2eacced4387..92ddbf97243 100644
--- a/examples/models/gemma4_31b/quant/README.md
+++ b/examples/models/gemma4_31b/quant/README.md
@@ -50,5 +50,3 @@ The format is compatible with torchao's `save_pretrained` / `load_pretrained`.
 
 - `pack_metal.py` — Metal backend packer.
 - `gguf.py` — extend with Q5_K, Q8_0 GGUF quant types.
-- Upstream `Int4TilePackedTo4dTensor.from_int4_tensor()` to torchao
-  to replace the manual conversion in `pack_int4_for_cuda`.
diff --git a/examples/models/gemma4_31b/quant/pack_mlx.py b/examples/models/gemma4_31b/quant/pack_mlx.py
index 63aeca426a8..d627c9c437c 100644
--- a/examples/models/gemma4_31b/quant/pack_mlx.py
+++ b/examples/models/gemma4_31b/quant/pack_mlx.py
@@ -22,7 +22,7 @@
 
 from .pack import ModulePackerFn, pack_model  # noqa: F401
 
-_MLX_SUPPORTED_GROUP_SIZES = (128, 64, 32)
+_MLX_SUPPORTED_GROUP_SIZES = (128, 64, 32, 16)
 
 
 # ---------------------------------------------------------------------------
@@ -126,7 +126,9 @@ def pack_for_mlx(module: nn.Module, weights: dict[str, torch.Tensor]) -> None:
     default dispatch produces the ``dequantize_affine → linear`` pattern
     MLX expects.  Regroups to a compatible group_size when needed (e.g.
     per-axis group_size=5376 → group_size=128) since MLX's
-    ``parse_dequant_node`` only accepts group_size in {32, 64, 128}.
+    ``parse_dequant_node`` only accepts group_size in {16, 32, 64, 128}.
+    Group sizes ≥ 32 use the fused ``QuantizedMatmulNode``; group_size=16
+    (e.g. GGUF Q6_K) falls back to ``DequantizeNode`` + matmul at export.
     """
     from torchao.quantization import IntxUnpackedToInt8Tensor
     from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor
diff --git a/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py b/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py
index ffb2e0e2dd3..2e6310b9c10 100644
--- a/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py
+++ b/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py
@@ -146,7 +146,7 @@ def test_regroup_preserves_dequant(self):
 
 class TestMlxGroupSize(unittest.TestCase):
     def test_passthrough(self):
-        for gs in (32, 64, 128):
+        for gs in (16, 32, 64, 128):
             self.assertEqual(_mlx_group_size(gs, 256), gs)
 
     def test_regroup_5376(self):
@@ -157,7 +157,49 @@ def test_regroup_256(self):
 
     def test_rejects_indivisible(self):
         with self.assertRaises(ValueError):
-            _mlx_group_size(48, 48)
+            _mlx_group_size(7, 7)
+
+
+class TestPackLinearGroupSize16(unittest.TestCase):
+    """Packing group_size=16 weights (GGUF Q6_K) preserves semantics."""
+
+    def _make_gs16_tensor(self, N=64, K=128):
+        from torchao.quantization import IntxUnpackedToInt8Tensor
+
+        return IntxUnpackedToInt8Tensor(
+            qdata=torch.randint(-32, 32, (N, K), dtype=torch.int8),  # high exclusive
+            scale=torch.randn(N, K // 16, dtype=torch.bfloat16),
+            zero_point=torch.zeros(N, K // 16, dtype=torch.int8),
+            target_dtype=torch.int8,
+            block_size=(1, 16),
+            dtype=torch.bfloat16,
+            activation_quantization=None,
+        )
+
+    def test_dequant_preserves_values(self):
+        """Dequantizing before and after ``pack_for_mlx`` yields the same values."""
+        w = self._make_gs16_tensor(64, 128)
+        before = dequantize_weight(w, torch.float32)
+
+        module = nn.Linear(128, 64, bias=False)
+        pack_for_mlx(module, {"weight": w})
+        after = dequantize_weight(module.weight.data, torch.float32)
+
+        self.assertTrue(
+            torch.allclose(before, after, atol=1e-5),
+            f"max diff: {(before - after).abs().max():.6g}",
+        )
+
+    def test_forward_produces_valid_output(self):
+        """Packed gs=16 weight produces finite (no NaN/inf) output in a linear forward."""
+        w = self._make_gs16_tensor(64, 128)
+        module = nn.Linear(128, 64, bias=False)
+        pack_for_mlx(module, {"weight": w})
+
+        x = torch.randn(1, 128, dtype=torch.bfloat16)
+        out = torch.nn.functional.linear(x, module.weight.data.dequantize())
+        self.assertEqual(out.shape, torch.Size([1, 64]))
+        self.assertTrue(torch.isfinite(out).all())
 
 
 class TestPackEmbeddingForMlx(unittest.TestCase):
diff --git a/examples/models/gemma4_31b/tests/test_mlx_pipeline.py b/examples/models/gemma4_31b/tests/test_mlx_pipeline.py
index 0e62ab88e4b..37f61fddb0f 100644
--- a/examples/models/gemma4_31b/tests/test_mlx_pipeline.py
+++ b/examples/models/gemma4_31b/tests/test_mlx_pipeline.py
@@ -244,5 +244,84 @@ def test_export_to_pte(self):
             self.assertTrue(os.path.exists(os.path.join(out_dir, "model.pte")))
 
 
+class TestGgufMlxPipeline(unittest.TestCase):
+    """Test GGUF → MLX loading path with synthetic Q6_K-like tensors."""
+
+    def test_load_gguf_model_mlx_backend(self):
+        """gguf_loader.load_gguf_model accepts backend='mlx'."""
+        try:
+            import gguf  # noqa: F401
+        except ModuleNotFoundError:
+            self.skipTest("gguf package not installed")
+
+        from executorch.examples.models.gemma4_31b.gguf_loader import load_gguf_model
+
+        # Must fail on the missing file, NOT with the "Unsupported backend" ValueError.
+        with self.assertRaises((OSError, RuntimeError)):
+            load_gguf_model("/nonexistent.gguf", backend="mlx")
+
+    def test_mlx_backend_rejects_unknown(self):
+        from executorch.examples.models.gemma4_31b.gguf_loader import load_gguf_model
+
+        with self.assertRaisesRegex(ValueError, "Unsupported backend"):
+            load_gguf_model("/nonexistent.gguf", backend="tpu")
+
+    def test_gs16_packing_preserves_values(self):
+        """Q6_K-like weight (gs=16) preserves dequantized values after packing."""
+        from executorch.examples.models.gemma4_31b.quant.pack_mlx import pack_for_mlx
+        from executorch.examples.models.gemma4_31b.quant.quantize import (
+            dequantize_weight,
+        )
+        from torchao.quantization import IntxUnpackedToInt8Tensor
+
+        w = IntxUnpackedToInt8Tensor(
+            qdata=torch.randint(-32, 32, (64, 128), dtype=torch.int8),  # high exclusive
+            scale=torch.randn(64, 8, dtype=torch.bfloat16),
+            zero_point=torch.zeros(64, 8, dtype=torch.int8),
+            target_dtype=torch.int8,
+            block_size=(1, 16),
+            dtype=torch.bfloat16,
+            activation_quantization=None,
+        )
+        before = dequantize_weight(w, torch.float32)
+
+        module = nn.Linear(128, 64, bias=False)
+        pack_for_mlx(module, {"weight": w})
+        after = dequantize_weight(module.weight.data, torch.float32)
+
+        self.assertTrue(
+            torch.allclose(before, after, atol=1e-5),
+            f"max diff: {(before - after).abs().max():.6g}",
+        )
+
+    def test_embedding_packing_preserves_values(self):
+        """MLX embedding packing preserves dequantized weight values."""
+        from executorch.examples.models.gemma4_31b.quant.pack_mlx import pack_for_mlx
+        from executorch.examples.models.gemma4_31b.quant.quantize import (
+            dequantize_weight,
+        )
+        from torchao.quantization import IntxUnpackedToInt8Tensor
+
+        w = IntxUnpackedToInt8Tensor(
+            qdata=torch.randint(-8, 8, (256, 128), dtype=torch.int8),  # high exclusive
+            scale=torch.randn(256, 4, dtype=torch.bfloat16),
+            zero_point=torch.zeros(256, 4, dtype=torch.bfloat16),
+            target_dtype=torch.int4,
+            block_size=(1, 32),
+            dtype=torch.bfloat16,
+            activation_quantization=None,
+        )
+        before = dequantize_weight(w, torch.float32)
+
+        module = nn.Embedding(256, 128)
+        pack_for_mlx(module, {"weight": w})
+        after = dequantize_weight(module.weight.data, torch.float32)
+
+        self.assertTrue(
+            torch.allclose(before, after, atol=1e-5),
+            f"max diff: {(before - after).abs().max():.6g}",
+        )
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/examples/nxp/executor_runner/nxp_executor_runner.cpp b/examples/nxp/executor_runner/nxp_executor_runner.cpp
index 65f5831e5c5..52d7c778227 100644
--- a/examples/nxp/executor_runner/nxp_executor_runner.cpp
+++ b/examples/nxp/executor_runner/nxp_executor_runner.cpp
@@ -384,71 +384,30 @@ int main(int argc, char* argv[]) {
   torch::executor::MemoryManager memory_manager(
       &method_allocator, &planned_memory, &tmp_allocator);
 
-  Result<torch::executor::Method> method =
-      program->load_method(method_name, &memory_manager);
-  if (!method.ok()) {
-    fprintf(
-        stderr,
-        "Loading of method (%s) failed with status %" PRIu32 "...\n",
-        method_name,
-        (unsigned int)method.error());
-    exit(-1);
-  }
-  printf("Method loaded...\n");
-
-  Error status = Error::Ok;
-  if (!FLAGS_dataset.empty()) {
-    // Go through entire dataset for this model.
-    FLAGS_dataset += "/";
-    while (dataset = readdir(datasetDir)) {
-      if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, ".."))
-        continue;
-
-      std::vector<std::string> inputsData;
-      inputsData.push_back(FLAGS_dataset + dataset->d_name);
-      // Set input and call inferrence.
-      setInputs(method.get(), inputsData);
-
-      status = method->execute();
-      if (status != Error::Ok) {
-        fprintf(
-            stderr,
-            "Execution of method %s failed with status %" PRIu32 "...\n",
-            method_name,
-            (unsigned int)status);
-        exit(-1);
-      } else {
-        printf("Method executed successfully...\n");
-      }
-
-      // Save outputs in binary files.
-      saveOutputs(method.get(), FLAGS_output, dataset->d_name);
-      // Print result with highest confidence.
-      printOutput(method.get(), FLAGS_output, dataset->d_name);
+  {
+    Result<torch::executor::Method> method =
+        program->load_method(method_name, &memory_manager);
+    if (!method.ok()) {
+      fprintf(
+          stderr,
+          "Loading of method (%s) failed with status %" PRIu32 "...\n",
+          method_name,
+          (unsigned int)method.error());
+      exit(-1);
     }
-    closedir(datasetDir);
-  } else if (!FLAGS_inputs.empty()) {
-    std::vector<std::string> inputPaths;
-
-    // Validate and process inputs and separate into two lists.
-    processInputs(inputPaths, FLAGS_inputs);
-
-    if (std::all_of(inputPaths.begin(), inputPaths.end(), isDirectory)) {
-      // Inputs are in directories - use files in each directory as the inputs.
-      std::vector<std::string> inputsData;
-      for (std::string& inputDir : inputPaths) {
-        datasetDir = opendir(inputDir.c_str());
-        while (dataset = readdir(datasetDir)) {
-          if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, ".."))
-            continue;
-
-          inputsData.push_back(inputDir + "/" + dataset->d_name);
-        }
-        closedir(datasetDir);
-
-        // Sort inputsData to ensure correct input ordering
-        std::sort(inputsData.begin(), inputsData.end());
-
+    printf("Method loaded...\n");
+
+    Error status = Error::Ok;
+    if (!FLAGS_dataset.empty()) {
+      // Go through entire dataset for this model.
+      FLAGS_dataset += "/";
+      while (dataset = readdir(datasetDir)) {
+        if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, ".."))
+          continue;
+
+        std::vector<std::string> inputsData;
+        inputsData.push_back(FLAGS_dataset + dataset->d_name);
+        // Set input and call inference.
         setInputs(method.get(), inputsData);
 
         status = method->execute();
@@ -463,37 +422,81 @@ int main(int argc, char* argv[]) {
           printf("Method executed successfully...\n");
         }
 
-        if (inputDir.back() == '/')
-          inputDir.pop_back();
-
-        auto pos = inputDir.find_last_of('/');
-        if (pos != std::string::npos)
-          inputDir = inputDir.substr(pos + 1);
-
         // Save outputs in binary files.
-        saveOutputs(method.get(), FLAGS_output, inputDir.c_str());
-        inputsData.clear();
+        saveOutputs(method.get(), FLAGS_output, dataset->d_name);
+        // Print result with highest confidence.
+        printOutput(method.get(), FLAGS_output, dataset->d_name);
       }
-    } else {
-      // Inputs are files.
-      setInputs(method.get(), inputPaths);
-
-      status = method->execute();
-      if (status != Error::Ok) {
-        fprintf(
-            stderr,
-            "Execution of method %s failed with status %" PRIu32 "...\n",
-            method_name,
-            (unsigned int)status);
-        exit(-1);
+      closedir(datasetDir);
+    } else if (!FLAGS_inputs.empty()) {
+      std::vector<std::string> inputPaths;
+
+      // Validate and process inputs and separate into two lists.
+      processInputs(inputPaths, FLAGS_inputs);
+
+      if (std::all_of(inputPaths.begin(), inputPaths.end(), isDirectory)) {
+        // Inputs are in directories - use files in each directory as the
+        // inputs.
+        std::vector<std::string> inputsData;
+        for (std::string& inputDir : inputPaths) {
+          datasetDir = opendir(inputDir.c_str());
+          while (dataset = readdir(datasetDir)) {
+            if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, ".."))
+              continue;
+
+            inputsData.push_back(inputDir + "/" + dataset->d_name);
+          }
+          closedir(datasetDir);
+
+          // Sort inputsData to ensure correct input ordering
+          std::sort(inputsData.begin(), inputsData.end());
+
+          setInputs(method.get(), inputsData);
+
+          status = method->execute();
+          if (status != Error::Ok) {
+            fprintf(
+                stderr,
+                "Execution of method %s failed with status %" PRIu32 "...\n",
+                method_name,
+                (unsigned int)status);
+            exit(-1);
+          } else {
+            printf("Method executed successfully...\n");
+          }
+
+          if (inputDir.back() == '/')
+            inputDir.pop_back();
+
+          auto pos = inputDir.find_last_of('/');
+          if (pos != std::string::npos)
+            inputDir = inputDir.substr(pos + 1);
+
+          // Save outputs in binary files.
+          saveOutputs(method.get(), FLAGS_output, inputDir.c_str());
+          inputsData.clear();
+        }
       } else {
-        printf("Method executed successfully...\n");
-      }
+        // Inputs are files.
+        setInputs(method.get(), inputPaths);
+
+        status = method->execute();
+        if (status != Error::Ok) {
+          fprintf(
+              stderr,
+              "Execution of method %s failed with status %" PRIu32 "...\n",
+              method_name,
+              (unsigned int)status);
+          exit(-1);
+        } else {
+          printf("Method executed successfully...\n");
+        }
 
-      // Save outputs in binary files.
-      saveOutputs(method.get(), FLAGS_output);
+        // Save outputs in binary files.
+        saveOutputs(method.get(), FLAGS_output);
+      }
     }
-  }
+  } // Destruct the method object before destroying the Neutron Device.
 
   printf("Finished...\n");
 
diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/AsrModuleInstrumentationTest.kt b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/AsrModuleInstrumentationTest.kt
new file mode 100644
index 00000000000..fe8a168e406
--- /dev/null
+++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/AsrModuleInstrumentationTest.kt
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+package org.pytorch.executorch
+
+import androidx.test.ext.junit.runners.AndroidJUnit4
+import java.io.File
+import java.io.IOException
+import org.apache.commons.io.FileUtils
+import org.junit.Assert.assertEquals
+import org.junit.Assert.assertFalse
+import org.junit.Assert.assertTrue
+import org.junit.Assert.fail
+import org.junit.Assume.assumeNotNull
+import org.junit.Test
+import org.junit.runner.RunWith
+import org.pytorch.executorch.TestFileUtils.getTestFilePath
+import org.pytorch.executorch.extension.asr.AsrCallback
+import org.pytorch.executorch.extension.asr.AsrModule
+import org.pytorch.executorch.extension.asr.AsrTranscribeConfig
+
+/**
+ * Instrumentation tests for [AsrModule], [AsrTranscribeConfig], and [AsrCallback].
+ *
+ * Tests cover:
+ * - Constructor validation (invalid model/tokenizer/preprocessor paths)
+ * - AsrTranscribeConfig builder and validation
+ * - Lifecycle (close idempotency, use-after-close)
+ * - Transcribe validation (invalid WAV path)
+ *
+ * The test fixture is the TinyStories-110M LLM model, NOT an ASR model, so functional transcription
+ * tests are not possible. Tests that require a valid AsrModule instance handle the case where
+ * nativeCreate fails (stories.pte lacks encoder/text_decoder methods).
+ */
+@RunWith(AndroidJUnit4::class)
+class AsrModuleInstrumentationTest {
+
+  // ─── Constructor validation ─────────────────────────────────────────────────
+
+  @Test(timeout = 30_000)
+  fun testInvalidModelPathThrows() {
+    try {
+      AsrModule("/nonexistent/model.pte", "/nonexistent/tokenizer")
+      fail("Should throw for invalid model path")
+    } catch (_: IllegalArgumentException) {
+      // Expected: require(modelFile.canRead() && modelFile.isFile)
+    }
+  }
+
+  @Test(timeout = 30_000)
+  fun testInvalidTokenizerPathThrows() {
+    val modelFile = provisionModelFile()
+    assumeNotNull("Test resource $MODEL_FILE_NAME not available", modelFile)
+    try {
+      AsrModule(modelFile!!.absolutePath, "/nonexistent/tokenizer")
+      fail("Should throw for invalid tokenizer path")
+    } catch (_: IllegalArgumentException) {
+      // Expected: require(tokenizerFile.exists())
+    }
+  }
+
+  @Test(timeout = 30_000)
+  fun testInvalidPreprocessorPathThrows() {
+    val modelFile = provisionModelFile()
+    val tokenizerFile = provisionTokenizerFile()
+    assumeNotNull("Test resource $MODEL_FILE_NAME not available", modelFile)
+    assumeNotNull("Test resource $TOKENIZER_FILE_NAME not available", tokenizerFile)
+    try {
+      AsrModule(
+          modelFile!!.absolutePath,
+          tokenizerFile!!.absolutePath,
+          preprocessorPath = "/nonexistent/preprocessor.pte",
+      )
+      fail("Should throw for invalid preprocessor path")
+    } catch (_: IllegalArgumentException) {
+      // Expected: require(preprocessorFile.canRead() && preprocessorFile.isFile)
+    }
+  }
+
+  @Test(timeout = 30_000)
+  fun testNonAsrModelFailsGracefully() {
+    val modelFile = provisionModelFile()
+    val tokenizerFile = provisionTokenizerFile()
+    assumeNotNull("Test resource $MODEL_FILE_NAME not available", modelFile)
+    assumeNotNull("Test resource $TOKENIZER_FILE_NAME not available", tokenizerFile)
+    try {
+      val module = AsrModule(modelFile!!.absolutePath, tokenizerFile!!.absolutePath)
+      // If construction succeeds (model was accepted), verify basic state
+      assertTrue("Module should be valid after construction", module.isValid)
+      module.close()
+    } catch (_: ExecutorchRuntimeException) {
+      // Expected: nativeCreate returns 0 for non-ASR model
+    } catch (_: RuntimeException) {
+      // Also acceptable: native layer rejects the model
+    }
+  }
+
+  // ─── Lifecycle ──────────────────────────────────────────────────────────────
+
+  @Test(timeout = 30_000)
+  fun testCloseIsIdempotent() {
+    val module = tryCreateAsrModule() ?: return
+    module.close()
+    module.close()
+    module.close()
+    assertFalse("isValid must be false after close", module.isValid)
+  }
+
+  @Test(timeout = 30_000)
+  fun testLoadAfterCloseThrows() {
+    val module = tryCreateAsrModule() ?: return
+    module.close()
+    try {
+      module.load()
+      fail("load() after close() must throw IllegalStateException")
+    } catch (_: IllegalStateException) {
+      // Expected
+    }
+  }
+
+  @Test(timeout = 30_000)
+  fun testTranscribeAfterCloseThrows() {
+    val module = tryCreateAsrModule() ?: return
+    module.close()
+    try {
+      module.transcribe("/some/audio.wav")
+      fail("transcribe() after close() must throw IllegalStateException")
+    } catch (_: IllegalStateException) {
+      // Expected
+    }
+  }
+
+  @Test(timeout = 30_000)
+  fun testIsValidAndIsLoadedState() {
+    val module = tryCreateAsrModule() ?: return
+    assertTrue("Module should be valid after construction", module.isValid)
+    module.close()
+    assertFalse("Module should not be valid after close", module.isValid)
+    assertFalse("Module should not be loaded after close", module.isLoaded)
+  }
+
+  // ─── Transcribe validation ──────────────────────────────────────────────────
+
+  @Test(timeout = 30_000)
+  fun testTranscribeInvalidWavPathThrows() {
+    val module = tryCreateAsrModule() ?: return
+    try {
+      module.transcribe("/nonexistent/audio.wav")
+      fail("transcribe() with invalid WAV path must throw")
+    } catch (_: IllegalArgumentException) {
+      // Expected: require(wavFile.canRead() && wavFile.isFile)
+    } finally {
+      module.close()
+    }
+  }
+
+  // ─── AsrTranscribeConfig ────────────────────────────────────────────────────
+
+  @Test
+  fun testConfigDefaults() {
+    val config = AsrTranscribeConfig()
+    assertEquals(128L, config.maxNewTokens)
+    assertEquals(0.0f, config.temperature, 0.0f)
+    assertEquals(0L, config.decoderStartTokenId)
+  }
+
+  @Test
+  fun testConfigBuilder() {
+    val config =
+        AsrTranscribeConfig.Builder()
+            .setMaxNewTokens(256)
+            .setTemperature(0.7f)
+            .setDecoderStartTokenId(50258)
+            .build()
+    assertEquals(256L, config.maxNewTokens)
+    assertEquals(0.7f, config.temperature, 0.001f)
+    assertEquals(50258L, config.decoderStartTokenId)
+  }
+
+  @Test
+  fun testConfigCustomValues() {
+    val config = AsrTranscribeConfig(maxNewTokens = 64, temperature = 0.5f, decoderStartTokenId = 1)
+    assertEquals(64L, config.maxNewTokens)
+    assertEquals(0.5f, config.temperature, 0.001f)
+    assertEquals(1L, config.decoderStartTokenId)
+  }
+
+  @Test(expected = IllegalArgumentException::class)
+  fun testConfigZeroMaxNewTokensThrows() {
+    AsrTranscribeConfig(maxNewTokens = 0)
+  }
+
+  @Test(expected = IllegalArgumentException::class)
+  fun testConfigNegativeMaxNewTokensThrows() {
+    AsrTranscribeConfig(maxNewTokens = -1)
+  }
+
+  @Test(expected = IllegalArgumentException::class)
+  fun testConfigNegativeTemperatureThrows() {
+    AsrTranscribeConfig(temperature = -0.1f)
+  }
+
+  @Test(expected = IllegalArgumentException::class)
+  fun testConfigBuilderZeroMaxNewTokensThrows() {
+    AsrTranscribeConfig.Builder().setMaxNewTokens(0).build()
+  }
+
+  @Test(expected = IllegalArgumentException::class)
+  fun testConfigBuilderNegativeTemperatureThrows() {
+    AsrTranscribeConfig.Builder().setTemperature(-1.0f).build()
+  }
+
+  @Test
+  fun testConfigDataClassEquality() {
+    val a = AsrTranscribeConfig(maxNewTokens = 100, temperature = 0.5f, decoderStartTokenId = 42)
+    val b = AsrTranscribeConfig(maxNewTokens = 100, temperature = 0.5f, decoderStartTokenId = 42)
+    assertEquals(a, b)
+    assertEquals(a.hashCode(), b.hashCode())
+  }
+
+  // ─── Helpers ────────────────────────────────────────────────────────────────
+
+  @Throws(IOException::class)
+  private fun provisionModelFile(): File? {
+    val pteFile = File(getTestFilePath(MODEL_FILE_NAME))
+    val stream = javaClass.getResourceAsStream(MODEL_FILE_NAME) ?: return null
+    stream.use { FileUtils.copyInputStreamToFile(it, pteFile) }
+    return pteFile
+  }
+
+  @Throws(IOException::class)
+  private fun provisionTokenizerFile(): File? {
+    val tokenizerFile = File(getTestFilePath(TOKENIZER_FILE_NAME))
+    val stream = javaClass.getResourceAsStream(TOKENIZER_FILE_NAME) ?: return null
+    stream.use { FileUtils.copyInputStreamToFile(it, tokenizerFile) }
+    return tokenizerFile
+  }
+
+  private fun tryCreateAsrModule(): AsrModule? {
+    val modelFile = provisionModelFile()
+    val tokenizerFile = provisionTokenizerFile()
+    assumeNotNull("Test resource $MODEL_FILE_NAME not available", modelFile)
+    assumeNotNull("Test resource $TOKENIZER_FILE_NAME not available", tokenizerFile)
+    return try {
+      AsrModule(modelFile!!.absolutePath, tokenizerFile!!.absolutePath)
+    } catch (e: RuntimeException) {
+      // nativeCreate may reject non-ASR models: report the test as skipped, not as passed.
+      throw org.junit.AssumptionViolatedException("AsrModule rejected the test fixture", e)
+    }
+  }
+
+  companion object {
+    private const val MODEL_FILE_NAME = "/stories.pte"
+    private const val TOKENIZER_FILE_NAME = "/tokenizer.bin"
+  }
+}
diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmLoraInstrumentationTest.kt b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmLoraInstrumentationTest.kt
new file mode 100644
index 00000000000..a8d35b09de2
--- /dev/null
+++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmLoraInstrumentationTest.kt
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+package org.pytorch.executorch
+
+import androidx.test.ext.junit.runners.AndroidJUnit4
+import java.io.File
+import java.io.IOException
+import org.apache.commons.io.FileUtils
+import org.junit.After
+import org.junit.Assert.assertTrue
+import org.junit.Assert.fail
+import org.junit.Before
+import org.junit.Test
+import org.junit.runner.RunWith
+import org.pytorch.executorch.TestFileUtils.getTestFilePath
+import org.pytorch.executorch.extension.llm.LlmCallback
+import org.pytorch.executorch.extension.llm.LlmModule
+import org.pytorch.executorch.extension.llm.LlmModuleConfig
+
+/**
+ * Instrumentation tests for LlmModule's LoRA / dataFiles constructor paths.
+ *
+ * LoRA adapters are loaded at construction time via the `dataFiles` parameter or
+ * `LlmModuleConfig.dataPath`. These tests verify that:
+ * 1. The dataFiles constructor variants produce a functional module
+ * 2. LlmModuleConfig with dataPath integrates correctly
+ * 3. Invalid data file paths are handled gracefully
+ * 4. Empty vs null dataFiles behave identically to no-data constructors
+ *
+ * Uses TinyStories-110M; no LoRA adapter fixture is available so functional LoRA tests
+ * (output-changes-with-adapter) are not possible.
+ */
+@RunWith(AndroidJUnit4::class)
+class LlmLoraInstrumentationTest {
+
+  private var llmModule: LlmModule? = null
+
+  @Before
+  @Throws(IOException::class)
+  fun setUp() {
+    val pteFile = File(getTestFilePath(MODEL_FILE_NAME))
+    requireNotNull(javaClass.getResourceAsStream(MODEL_FILE_NAME)) {
+          "Test resource $MODEL_FILE_NAME not found; did android_test_setup.sh run?"
+        }
+        .use { FileUtils.copyInputStreamToFile(it, pteFile) }
+
+    val tokenizerFile = File(getTestFilePath(TOKENIZER_FILE_NAME))
+    requireNotNull(javaClass.getResourceAsStream(TOKENIZER_FILE_NAME)) {
+          "Test resource $TOKENIZER_FILE_NAME not found; did android_test_setup.sh run?"
+        }
+        .use { FileUtils.copyInputStreamToFile(it, tokenizerFile) }
+  }
+
+  @After
+  fun tearDown() {
+    llmModule?.close()
+    llmModule = null
+  }
+
+  // ─── dataFiles constructor variants ─────────────────────────────────────────
+
+  @Test(timeout = MAX_TEST_TIMEOUT_MS)
+  fun testConstructorWithEmptyDataFilesList() {
+    llmModule =
+        LlmModule(
+            LlmModule.MODEL_TYPE_TEXT,
+            getTestFilePath(MODEL_FILE_NAME),
+            getTestFilePath(TOKENIZER_FILE_NAME),
+            0.0f,
+            emptyList<String>(),
+        )
+    val tokens = generateAndCollect(llmModule!!)
+    assertTrue("Module with empty dataFiles should generate tokens", tokens.isNotEmpty())
+  }
+
+  @Test(timeout = MAX_TEST_TIMEOUT_MS)
+  fun testConstructorWithNullDataPath() {
+    llmModule =
+        LlmModule(
+            LlmModule.MODEL_TYPE_TEXT,
+            getTestFilePath(MODEL_FILE_NAME),
+            getTestFilePath(TOKENIZER_FILE_NAME),
+            0.0f,
+            null as String?,
+        )
+    val tokens = generateAndCollect(llmModule!!)
+    assertTrue("Module with null dataPath should generate tokens", tokens.isNotEmpty())
+  }
+
+  @Test(timeout = MAX_TEST_TIMEOUT_MS)
+  fun testConstructorWithDataFilesAndBosEos() {
+    llmModule =
+        LlmModule(
+            LlmModule.MODEL_TYPE_TEXT,
+            getTestFilePath(MODEL_FILE_NAME),
+            getTestFilePath(TOKENIZER_FILE_NAME),
+            0.0f,
+            emptyList<String>(),
+            0,
+            0,
+        )
+    val tokens = generateAndCollect(llmModule!!)
+    assertTrue("Module with dataFiles+BOS/EOS should generate tokens", tokens.isNotEmpty())
+  }
+
+  // ─── LlmModuleConfig with dataPath ──────────────────────────────────────────
+
+  @Test(timeout = MAX_TEST_TIMEOUT_MS)
+  fun testLlmModuleConfigNoDataPath() {
+    val config =
+        LlmModuleConfig.create()
+            .modulePath(getTestFilePath(MODEL_FILE_NAME))
+            .tokenizerPath(getTestFilePath(TOKENIZER_FILE_NAME))
+            .temperature(0.0f)
+            .build()
+    llmModule = LlmModule(config)
+    val tokens = generateAndCollect(llmModule!!)
+    assertTrue("Module via config with no dataPath should generate tokens", tokens.isNotEmpty())
+  }
+
+  @Test(timeout = MAX_TEST_TIMEOUT_MS)
+  fun testLlmModuleConfigWithNullDataPath() {
+    val config =
+        LlmModuleConfig.create()
+            .modulePath(getTestFilePath(MODEL_FILE_NAME))
+            .tokenizerPath(getTestFilePath(TOKENIZER_FILE_NAME))
+            .temperature(0.0f)
+            .dataPath(null)
+            .build()
+    llmModule = LlmModule(config)
+    val tokens = generateAndCollect(llmModule!!)
+    assertTrue("Module via config with null dataPath should generate tokens", tokens.isNotEmpty())
+  }
+
+  @Test(timeout = MAX_TEST_TIMEOUT_MS)
+  fun testLlmModuleConfigWithLoadMode() {
+    val config =
+        LlmModuleConfig.create()
+            .modulePath(getTestFilePath(MODEL_FILE_NAME))
+            .tokenizerPath(getTestFilePath(TOKENIZER_FILE_NAME))
+            .temperature(0.0f)
+            .loadMode(LlmModuleConfig.LOAD_MODE_FILE)
+            .build()
+    llmModule = LlmModule(config)
+    val tokens = generateAndCollect(llmModule!!)
+    assertTrue("Module via config with LOAD_MODE_FILE should generate tokens", tokens.isNotEmpty())
+  }
+
+  // ─── Invalid data file paths ────────────────────────────────────────────────
+
+  @Test(timeout = MAX_TEST_TIMEOUT_MS)
+  fun testInvalidDataFilePathThrowsOnConstruction() {
+    try {
+      llmModule =
+          LlmModule(
+              LlmModule.MODEL_TYPE_TEXT,
+              getTestFilePath(MODEL_FILE_NAME),
+              getTestFilePath(TOKENIZER_FILE_NAME),
+              0.0f,
+              listOf("/nonexistent/lora_weights.bin"),
+          )
+      // dataFiles are passed to native initHybrid — invalid paths should cause
+      // construction to fail. If we reach here, the native layer didn't validate.
+      llmModule!!.close()
+      fail("Construction should have thrown for invalid data file path")
+    } catch (e: RuntimeException) {
+      assertTrue(
+          "Exception message should be non-empty",
+          e.message != null && e.message!!.isNotEmpty(),
+      )
+    }
+  }
+
+  @Test(timeout = MAX_TEST_TIMEOUT_MS)
+  fun testMultipleInvalidDataFilePathsThrowOnConstruction() {
+    try {
+      llmModule =
+          LlmModule(
+              LlmModule.MODEL_TYPE_TEXT,
+              getTestFilePath(MODEL_FILE_NAME),
+              getTestFilePath(TOKENIZER_FILE_NAME),
+              0.0f,
+              listOf("/nonexistent/a.bin", "/nonexistent/b.bin"),
+          )
+      llmModule!!.close()
+      fail("Construction should have thrown for invalid data file paths")
+    } catch (e: RuntimeException) {
+      assertTrue(
+          "Exception message should be non-empty",
+          e.message != null && e.message!!.isNotEmpty(),
+      )
+    }
+  }
+
+  // ─── Baseline equivalence ───────────────────────────────────────────────────
+
+  @Test(timeout = MAX_TEST_TIMEOUT_MS)
+  fun testEmptyDataFilesMatchesNoDataConstructor() {
+    val moduleNoData =
+        LlmModule(getTestFilePath(MODEL_FILE_NAME), getTestFilePath(TOKENIZER_FILE_NAME), 0.0f)
+    val moduleEmptyList =
+        LlmModule(
+            LlmModule.MODEL_TYPE_TEXT,
+            getTestFilePath(MODEL_FILE_NAME),
+            getTestFilePath(TOKENIZER_FILE_NAME),
+            0.0f,
+            emptyList<String>(),
+        )
+
+    try {
+      val tokensNoData = generateAndCollect(moduleNoData)
+      val tokensEmptyList = generateAndCollect(moduleEmptyList)
+
+      assertTrue("Both constructors should produce tokens", tokensNoData.isNotEmpty())
+      assertTrue("Both constructors should produce tokens", tokensEmptyList.isNotEmpty())
+    } finally {
+      moduleNoData.close()
+      moduleEmptyList.close()
+    }
+  }
+
+  // ─── LlmModuleConfig builder validation ─────────────────────────────────────
+
+  @Test(expected = IllegalArgumentException::class)
+  fun testConfigBuilderMissingModulePathThrows() {
+    LlmModuleConfig.create().tokenizerPath("/some/tokenizer.bin").build()
+  }
+
+  @Test(expected = IllegalArgumentException::class)
+  fun testConfigBuilderMissingTokenizerPathThrows() {
+    LlmModuleConfig.create().modulePath("/some/model.pte").build()
+  }
+
+  @Test(expected = IllegalArgumentException::class)
+  fun testConfigBuilderInvalidLoadModeThrows() {
+    LlmModuleConfig.create()
+        .modulePath("/some/model.pte")
+        .tokenizerPath("/some/tokenizer.bin")
+        .loadMode(99)
+        .build()
+  }
+
+  @Test
+  fun testConfigBuilderAllLoadModes() {
+    val modes =
+        listOf(
+            LlmModuleConfig.LOAD_MODE_FILE,
+            LlmModuleConfig.LOAD_MODE_MMAP,
+            LlmModuleConfig.LOAD_MODE_MMAP_USE_MLOCK,
+            LlmModuleConfig.LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS,
+        )
+    for (mode in modes) {
+      val config =
+          LlmModuleConfig.create()
+              .modulePath("/some/model.pte")
+              .tokenizerPath("/some/tokenizer.bin")
+              .loadMode(mode)
+              .build()
+      assertTrue("Config should accept load mode $mode", config.loadMode == mode)
+    }
+  }
+
+  // ─── Helpers ────────────────────────────────────────────────────────────────
+
+  private fun generateAndCollect(module: LlmModule): List<String> {
+    val collector = mutableListOf<String>()
+    module.generate(
+        TEST_PROMPT,
+        SEQ_LEN,
+        object : LlmCallback {
+          override fun onResult(result: String) {
+            collector.add(result)
+          }
+        },
+    )
+    return collector
+  }
+
+  companion object {
+    private const val MODEL_FILE_NAME = "/stories.pte"
+    private const val TOKENIZER_FILE_NAME = "/tokenizer.bin"
+    private const val TEST_PROMPT = "Once"
+    private const val SEQ_LEN = 16
+    private const val MAX_TEST_TIMEOUT_MS = 120_000L
+  }
+}
diff --git a/shim b/shim
index b295819bb0e..cf6a954aae4 160000
--- a/shim
+++ b/shim
@@ -1 +1 @@
-Subproject commit b295819bb0ec636b4e3359828e05476d2437650a
+Subproject commit cf6a954aae4bee7b4515e13475878460115027d1
diff --git a/third-party/ao b/third-party/ao
index 02105d46c61..01849b2b19c 160000
--- a/third-party/ao
+++ b/third-party/ao
@@ -1 +1 @@
-Subproject commit 02105d46c61dc80a8c9d39d5836e827ba3af8439
+Subproject commit 01849b2b19cb923cb739a1fc02297ba418ddf715
diff --git a/third-party/pocketfft b/third-party/pocketfft
index 81874074463..0fa0ef591e3 160000
--- a/third-party/pocketfft
+++ b/third-party/pocketfft
@@ -1 +1 @@
-Subproject commit 8187407446316c3d16f15e5395dabd4b22f4fec7
+Subproject commit 0fa0ef591e38c2758e3184c6c23e497b9f732ffa

From 4491310bdcbccf7513f6955b83b14be681b41830 Mon Sep 17 00:00:00 2001
From: Suraj Raut <sraut@cadence.com>
Date: Fri, 29 May 2026 01:09:51 -0700
Subject: [PATCH 6/7] Reset backends/cadence/aot/ to upstream (keep
 functions_vision.yaml)

---
 backends/cadence/aot/compiler_funcs.py        |  30 +++
 backends/cadence/aot/pass_utils.py            |  17 ++
 backends/cadence/aot/quantizer/BUCK           |  15 ++
 .../cadence/aot/quantizer/pattern_utils.py    | 207 ++++++++++++++++++
 backends/cadence/aot/quantizer/patterns.py    |  18 +-
 backends/cadence/aot/quantizer/utils.py       |   4 +-
 6 files changed, 289 insertions(+), 2 deletions(-)
 create mode 100644 backends/cadence/aot/quantizer/pattern_utils.py

diff --git a/backends/cadence/aot/compiler_funcs.py b/backends/cadence/aot/compiler_funcs.py
index 02dcde7fd39..cec3cb7d016 100644
--- a/backends/cadence/aot/compiler_funcs.py
+++ b/backends/cadence/aot/compiler_funcs.py
@@ -14,6 +14,7 @@
 import torch
 from torch._inductor.decomposition import remove_decompositions
 from torch.fx import GraphModule
+from torch.fx.passes.infra.pass_base import PassBase, PassResult
 from torchao.quantization.pt2e.quantize_pt2e import prepare_pt2e, prepare_qat_pt2e
 from torchao.quantization.pt2e.quantizer import Quantizer
 
@@ -607,3 +608,32 @@ def sink_input_dequant_through_transparent_ops(
         graph_module.recompile()
 
     return modified
+
+
+class QuantFusionPass(PassBase):
+    """
+    Iterates patterns, finds anchor ops in the converted graph, and calls
+    pattern.fuse() to replace dq-op-q subgraphs with fused ops.
+    """
+
+    def __init__(self, patterns: Sequence[object]) -> None:
+        super().__init__()
+        self.patterns = patterns
+
+    def call(self, graph_module: GraphModule) -> Optional[PassResult]:
+        changed = False
+        for pattern in self.patterns:
+            pattern_changed = False
+            for target in pattern.anchor_ops():  # pyre-ignore[16]
+                for node in graph_module.graph.find_nodes(
+                    op="call_function", target=target
+                ):
+                    result = pattern.fuse(graph_module, node)  # pyre-ignore[16]
+                    if result is not None:
+                        changed = True
+                        pattern_changed = True
+            if pattern_changed:
+                graph_module.graph.eliminate_dead_code()
+        if changed:
+            graph_module.recompile()
+        return PassResult(graph_module, changed)
diff --git a/backends/cadence/aot/pass_utils.py b/backends/cadence/aot/pass_utils.py
index ab42ef43d56..091605e94ec 100644
--- a/backends/cadence/aot/pass_utils.py
+++ b/backends/cadence/aot/pass_utils.py
@@ -212,3 +212,20 @@ def nodes_not_adjacent_in_gm(
 def none_throws(x: Optional[PassResult]) -> PassResult:
     assert x is not None
     return x
+
+
+def replace_with_op(
+    gm: torch.fx.GraphModule,
+    insert_after: torch.fx.Node,
+    replacement_op: torch._ops.OpOverload,
+    args: tuple,  # pyre-ignore[2]
+    kwargs: dict,  # pyre-ignore[2]
+    node_to_replace: torch.fx.Node,
+) -> torch.fx.Node:
+    """Insert ``replacement_op`` after ``insert_after`` and replace all uses of
+    ``node_to_replace`` with the new node (which gets a copy of its metadata)."""
+    with gm.graph.inserting_after(insert_after):
+        new_node = gm.graph.call_function(replacement_op, args, kwargs)
+    new_node.meta = node_to_replace.meta.copy()  # shallow copy: no shared meta dict
+    node_to_replace.replace_all_uses_with(new_node)
+    return new_node
diff --git a/backends/cadence/aot/quantizer/BUCK b/backends/cadence/aot/quantizer/BUCK
index 34fec2556f8..c2ec3e3a1f6 100644
--- a/backends/cadence/aot/quantizer/BUCK
+++ b/backends/cadence/aot/quantizer/BUCK
@@ -14,6 +14,21 @@ fbcode_target(_kind = runtime.python_library,
     ],
 )
 
+fbcode_target(_kind = runtime.python_library,
+    name = "pattern_utils",
+    srcs = [
+        "pattern_utils.py",
+    ],
+    typing = True,
+    deps = [
+        ":utils",
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot:compiler_utils",
+        "//executorch/backends/cadence/aot:pass_utils",
+        "//executorch/backends/cadence/aot:utils",
+    ],
+)
+
 fbcode_target(_kind = runtime.python_library,
     name = "patterns",
     srcs = [
diff --git a/backends/cadence/aot/quantizer/pattern_utils.py b/backends/cadence/aot/quantizer/pattern_utils.py
new file mode 100644
index 00000000000..25ff363ecc9
--- /dev/null
+++ b/backends/cadence/aot/quantizer/pattern_utils.py
@@ -0,0 +1,207 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import operator
+from typing import Any
+
+import torch
+from executorch.backends.cadence.aot.pass_utils import get_arg, replace_with_op
+from executorch.backends.cadence.aot.quantizer.utils import (
+    copy_node_metadata,
+    create_zero_bias_int32,
+    quantize_tensor_multiplier,
+)
+from executorch.backends.cadence.aot.utils import is_depthwise_conv
+from torch import fx
+from torch._ops import OpOverload
+
+DQ_PER_TENSOR: OpOverload = torch.ops.quantized_decomposed.dequantize_per_tensor.default
+Q_PER_TENSOR: OpOverload = torch.ops.quantized_decomposed.quantize_per_tensor.default
+
+
+def insert_node_with_meta(
+    gm: fx.GraphModule,
+    op: OpOverload,
+    args: tuple[Any, ...],
+    kwargs: dict[str, Any] | None,
+    insert_before: fx.Node,
+    like_node: fx.Node,
+) -> fx.Node:
+    """Create a new node and populate its FakeTensor metadata.
+
+    Inserts ``op(*args, **kwargs)`` before ``insert_before``, runs the op
+    under ``like_node``'s fake_mode to compute ``meta["val"]``, and copies
+    remaining metadata from ``like_node``.
+    """
+    with gm.graph.inserting_before(insert_before):
+        node = gm.graph.call_function(op, args, kwargs or {})
+    assert "val" in like_node.meta
+    fake_mode = like_node.meta["val"].fake_mode
+    assert fake_mode is not None
+
+    def _resolve(x: Any) -> Any:
+        return x.meta["val"] if isinstance(x, fx.Node) else x
+
+    fake_args = tuple(_resolve(a) for a in args)
+    fake_kwargs = {k: _resolve(v) for k, v in (kwargs or {}).items()}
+    with fake_mode:
+        node.meta["val"] = op(*fake_args, **fake_kwargs)
+    copy_node_metadata(node, like_node)
+    return node
+
+
+def find_quant_user(node: fx.Node) -> fx.Node | None:
+    """Return the first user (via getitem[0]) if it is quantize_per_tensor, else None."""
+    users = list(node.users)
+    if not users:
+        return None
+    user = users[0]
+    if user.target is operator.getitem:
+        if user.args[1] == 0:
+            users = list(user.users)
+            if not users:
+                return None
+            user = users[0]
+        else:
+            return None
+    if user.target == Q_PER_TENSOR:
+        return user
+    return None
+
+
+def fuse_conv(
+    pattern: object,
+    gm: fx.GraphModule,
+    conv_node: fx.Node,
+    dq_input: fx.Node,
+    dq_weight: fx.Node,
+    quant_node: fx.Node,
+) -> fx.Node:
+    """Fuse a dq->conv->q chain into a single quantized conv op."""
+    dq_bias = None
+    if len(conv_node.args) > 2 and conv_node.args[2] is not None:
+        bias_arg = conv_node.args[2]
+        assert isinstance(bias_arg, fx.Node)
+        dq_bias = bias_arg if bias_arg.target == DQ_PER_TENSOR else None
+    weight_scale = get_arg(dq_weight, "scale", float)
+    input_scale = get_arg(dq_input, "scale", float)
+    bias_scale = input_scale * weight_scale
+    if dq_bias is not None:
+        bias_q = get_arg(dq_bias, "input", fx.Node)
+    else:
+        # Cadence quantized conv ops require a non-optional bias argument.
+        weight_node = get_arg(dq_weight, "input", fx.Node)
+        with gm.graph.inserting_before(conv_node):
+            bias_q = create_zero_bias_int32(gm, weight_node, bias_scale)
+    requantize_scale = bias_scale / get_arg(quant_node, "scale", float)
+    requantize_scale_t = torch.tensor([requantize_scale])
+    out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t)
+    args = (
+        get_arg(dq_input, "input", fx.Node),
+        get_arg(dq_weight, "input", fx.Node),
+        bias_q,
+    )
+    groups = get_arg(conv_node, "groups", int)
+    kwargs = {
+        "stride": get_arg(conv_node, "stride", list[int]),
+        "padding": get_arg(conv_node, "padding", list[int]),
+        "dilation": get_arg(conv_node, "dilation", list[int]),
+        "groups": groups,
+        "input_zero_point": get_arg(dq_input, "zero_point", int),
+        "weight_zero_point": get_arg(dq_weight, "zero_point", int),
+        "bias_scale": bias_scale,
+        "out_scale": get_arg(quant_node, "scale", float),
+        "out_zero_point": get_arg(quant_node, "zero_point", int),
+        "out_multiplier": out_multiplier[0].item(),
+        "out_shift": out_shift[0].item(),
+    }
+    replacement_op = pattern.replacement_op()  # pyre-ignore[16]
+    if replacement_op == torch.ops.cadence.quantized_conv1d_ncl.per_tensor:
+        input_node = get_arg(dq_input, "input", fx.Node)
+        assert len(input_node.meta["val"].shape) >= 2
+        in_channels = input_node.meta["val"].shape[1]
+        if is_depthwise_conv(groups, in_channels):
+            replacement_op = torch.ops.cadence.quantized_depthwise_conv1d_ncl.per_tensor
+    return replace_with_op(gm, conv_node, replacement_op, args, kwargs, quant_node)
+
+
+def fuse_linear(
+    gm: fx.GraphModule,
+    dq_input: fx.Node,
+    dq_weight: fx.Node,
+    dq_bias: fx.Node | None,
+    quant_node: fx.Node,
+    op_node: fx.Node,
+    replacement_op: OpOverload,
+    weight_q: fx.Node | None = None,
+) -> fx.Node:
+    """Fuse a dq->linear->q chain into a single quantized linear op."""
+    assert op_node.target in (
+        torch.ops.aten.linear.default,
+        torch.ops.aten.addmm.default,
+    ), f"Expected linear/addmm, got {op_node.target}"
+    weight_scale = get_arg(dq_weight, "scale", float)
+    input_scale = get_arg(dq_input, "scale", float)
+    bias_scale = input_scale * weight_scale
+    requantize_scale = bias_scale / get_arg(quant_node, "scale", float)
+    requantize_scale_t = torch.tensor([requantize_scale])
+    out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t)
+    if dq_bias is not None:
+        bias_q = get_arg(dq_bias, "input", fx.Node)
+    else:
+        # Cadence quantized linear ops require a non-optional bias argument.
+        weight_node = get_arg(dq_weight, "input", fx.Node)
+        with gm.graph.inserting_before(op_node):
+            bias_q = create_zero_bias_int32(gm, weight_node, bias_scale)
+    final_weight = (
+        weight_q if weight_q is not None else get_arg(dq_weight, "input", fx.Node)
+    )
+    args = (get_arg(dq_input, "input", fx.Node), final_weight, bias_q)
+    kwargs = {
+        "src_zero_point": get_arg(dq_input, "zero_point", int),
+        "weight_zero_point": get_arg(dq_weight, "zero_point", int),
+        "out_multiplier": out_multiplier[0].item(),
+        "out_shift": out_shift[0].item(),
+        "out_zero_point": get_arg(quant_node, "zero_point", int),
+        "offset": None,
+    }
+    return replace_with_op(gm, op_node, replacement_op, args, kwargs, quant_node)
+
+
+def fuse_matmul(
+    gm: fx.GraphModule,
+    anchor_node: fx.Node,
+    dq0: fx.Node,
+    dq1: fx.Node,
+    quant_node: fx.Node,
+    replacement_op: OpOverload,
+) -> fx.Node:
+    """Fuse a dq->matmul->q chain into a single quantized matmul op."""
+    assert anchor_node.target in (
+        torch.ops.aten.bmm.default,
+        torch.ops.aten.matmul.default,
+    ), f"Expected bmm/matmul, got {anchor_node.target}"
+    scale0 = get_arg(dq0, "scale", float)
+    scale1 = get_arg(dq1, "scale", float)
+    requantize_scale = (scale0 * scale1) / get_arg(quant_node, "scale", float)
+    requantize_scale_t = torch.tensor([requantize_scale])
+    out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t)
+    args = (
+        get_arg(dq0, "input", fx.Node),
+        get_arg(dq0, "zero_point", int),
+        get_arg(dq1, "input", fx.Node),
+        get_arg(dq1, "zero_point", int),
+        None,
+    )
+    kwargs = {
+        "out_multiplier": out_multiplier[0].item(),
+        "out_shift": out_shift[0].item(),
+        "out_zero_point": get_arg(quant_node, "zero_point", int),
+        "transposed": False,
+    }
+    return replace_with_op(gm, anchor_node, replacement_op, args, kwargs, quant_node)
diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py
index 54c01227d07..e1f44b8ce5c 100644
--- a/backends/cadence/aot/quantizer/patterns.py
+++ b/backends/cadence/aot/quantizer/patterns.py
@@ -9,7 +9,7 @@
 import operator
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import List, Tuple, Union
+from typing import List, Optional, Tuple, Union
 
 import torch
 from executorch.backends.cadence.aot.quantizer.utils import get_bias_qparams
@@ -79,6 +79,22 @@ def replacement_op(self) -> OpOverload:
         """
         pass
 
+    def anchor_ops(self) -> tuple[OpOverload, ...]:
+        return tuple(self.partition_types())  # default: same ops as partition_types()
+
+    def fuse(
+        self,
+        gm: fx.GraphModule,
+        anchor_node: fx.Node,
+    ) -> Optional[fx.Node]:
+        """Replace the dq→op→q subgraph around ``anchor_node`` with a fused op.
+
+        Called by ``QuantFusionPass`` for each node matching ``anchor_ops()``.
+        Returns the new fused node on success, or ``None`` to skip this match.
+        Subclasses override to implement pattern-specific fusion logic.
+        """
+        return None  # base implementation fuses nothing; every match is skipped
+
 
 class AddmmPattern(QuantizationPattern):
     def partition_types(self) -> List[OpOverload]:
diff --git a/backends/cadence/aot/quantizer/utils.py b/backends/cadence/aot/quantizer/utils.py
index 51182a4ce92..f5773938f0a 100644
--- a/backends/cadence/aot/quantizer/utils.py
+++ b/backends/cadence/aot/quantizer/utils.py
@@ -118,7 +118,9 @@ def create_zero_bias_int32(
     bias_scale: float,
 ) -> fx.Node:
     """
-    Creates a zero bias tensor with the shape of weight[0]
+    Creates a zero bias tensor with the shape of weight[0].
+    Caller is responsible for setting the graph insertion point
+    (e.g. ``with gm.graph.inserting_before(node):``).
     """
     try:
         attr_node = getattr(graph_module, weight_node.target)

From f1693c2ab45835f2372bb3b056c0c7e72c78c751 Mon Sep 17 00:00:00 2001
From: Suraj Raut <sraut@cadence.com>
Date: Fri, 29 May 2026 01:20:56 -0700
Subject: [PATCH 7/7] Reset submodule pointers to upstream/main

---
 backends/xnnpack/third-party/XNNPACK     | 2 +-
 backends/xnnpack/third-party/cpuinfo     | 2 +-
 backends/xnnpack/third-party/pthreadpool | 2 +-
 extension/llm/tokenizers                 | 2 +-
 shim                                     | 2 +-
 third-party/ao                           | 2 +-
 third-party/pocketfft                    | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK
index 3131afead79..1adaa7c709d 160000
--- a/backends/xnnpack/third-party/XNNPACK
+++ b/backends/xnnpack/third-party/XNNPACK
@@ -1 +1 @@
-Subproject commit 3131afead790c5c69a9aa12273dfc40399789ad7
+Subproject commit 1adaa7c709d4839d29e1f219cb962b01c9e6a905
diff --git a/backends/xnnpack/third-party/cpuinfo b/backends/xnnpack/third-party/cpuinfo
index 8a9210069b5..f9a03241f8c 160000
--- a/backends/xnnpack/third-party/cpuinfo
+++ b/backends/xnnpack/third-party/cpuinfo
@@ -1 +1 @@
-Subproject commit 8a9210069b5a37dd89ed118a783945502a30a4ae
+Subproject commit f9a03241f8c3d4ed0c9728f5d70bff873d43d4e0
diff --git a/backends/xnnpack/third-party/pthreadpool b/backends/xnnpack/third-party/pthreadpool
index c2ba5c50bb5..a56dcd79c69 160000
--- a/backends/xnnpack/third-party/pthreadpool
+++ b/backends/xnnpack/third-party/pthreadpool
@@ -1 +1 @@
-Subproject commit c2ba5c50bb58d1397b693740cf75fad836a0d1bf
+Subproject commit a56dcd79c699366e7ac6466792c3025883ff7704
diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers
index 3aada3fe28c..b642403834a 160000
--- a/extension/llm/tokenizers
+++ b/extension/llm/tokenizers
@@ -1 +1 @@
-Subproject commit 3aada3fe28c945d14d5ec62254eb56ccdf10eb11
+Subproject commit b642403834a67c8ef14a7109dcd1bb5e5f3cb68a
diff --git a/shim b/shim
index cf6a954aae4..b295819bb0e 160000
--- a/shim
+++ b/shim
@@ -1 +1 @@
-Subproject commit cf6a954aae4bee7b4515e13475878460115027d1
+Subproject commit b295819bb0ec636b4e3359828e05476d2437650a
diff --git a/third-party/ao b/third-party/ao
index 01849b2b19c..02105d46c61 160000
--- a/third-party/ao
+++ b/third-party/ao
@@ -1 +1 @@
-Subproject commit 01849b2b19cb923cb739a1fc02297ba418ddf715
+Subproject commit 02105d46c61dc80a8c9d39d5836e827ba3af8439
diff --git a/third-party/pocketfft b/third-party/pocketfft
index 0fa0ef591e3..81874074463 160000
--- a/third-party/pocketfft
+++ b/third-party/pocketfft
@@ -1 +1 @@
-Subproject commit 0fa0ef591e38c2758e3184c6c23e497b9f732ffa
+Subproject commit 8187407446316c3d16f15e5395dabd4b22f4fec7